Rebecca DiBari

Homework 1

data

mydata = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_precip.dat"), 
    ncol = 4, byrow = T)
lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]
X = as.data.frame(cbind(lat, long, elevation))  #predictor variables

Libraries

library(MPV)
library(leaps)
library(MASS)
library(akima)
library(fields)
library(fitdistrplus)
library(sm)
library(locfit)
library(stats)
library(LatticeKrig)

Sourced functions

# functions to produce boxplots with whiskers at 5th and 95th percentile
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess1/myboxplot.r")
source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess1/myboxplot-stats.r")

thebest = function(X, Y, deg, family) {
    nvar = length(X[1, ])
    N = length(precip)

    minalpha = 2 * (nvar * deg + 1)/N
    alpha_grid = seq(minalpha, 1, by = 0.05)

    zz = gcvplot(precip ~ X, alpha = alpha_grid, deg = deg, kern = "bisq", ev = dat(), 
        scale = TRUE, family = family)

    mygcv = sort(zz$values)[1]
    # return(mygcv)

    alpha = alpha_grid
    # pick the best alpha from the best family that gives the least GCV
    z6 = order(zz$values)[1]
    bestalpha = alpha[z6]
    # return(bestalpha)

    myresult = list(gcv = mygcv, bestalpha = bestalpha, family = family)
}

Question 1

(a) Fit a best linear regression model


# Find the best model using PRESS

# all combinations
combs = leaps(X, precip)
combos = combs$which
ncombos = length(combos[, 1])
xpress = 1:ncombos

for (i in 1:ncombos) {
    zz = lm(precip ~ as.matrix(X[, combos[i, ]]))
    xpress[i] = PRESS(zz)
}

# best model based on PRESS
zc = order(xpress)[1]
# indicates which variables are present in the best model
combos[zc, ]

##     1     2     3 
## FALSE  TRUE FALSE


# best model omits lat and elevation

# Compare with stepAIC
zz = lm(precip ~ elevation + lat + long)
bestmodel = stepAIC(zz, trace = FALSE)

(b) Perform ANOVA and model Diagnostics

n = length(precip)

par(mfrow = c(2, 2))

# Model Diagnostics: Check normality, autocorrelation and homoskedasticity
# of residuals
error = resid(bestmodel)
Y_hat = precip - error

hist(error, probability = T)
# fit and plot a normal distribution to the residuals..
lines(error[order(error)], dnorm(error[order(error)], mean = mean(error), sd = sd(error)), 
    col = "red")
# nonparametric PDF
sm.density(error, add = T, col = "blue")
legend("topright", legend = c("mean error", "non par"), lty = 1, lwd = c(1, 
    1), col = c("red", "blue"))

# Homoskedastic: residuals versus y_hat
plot(Y_hat, error)
abline(h = 0, col = "red")
title("Homoskedasticity")

# Check normality of residuals
qqnorm(error, ylab = "Standardized Residuals", xlab = "Normal Scores", main = "Normal QQPlot of Residuals")
qqline(error, col = "red")

# Autocorrelation test
acf(error)

# ANOVA
bestmodel$anova

## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## precip ~ elevation + lat + long
## 
## Final Model:
## precip ~ long
## 
## 
##          Step Df Deviance Resid. Df Resid. Dev  AIC
## 1                               487   26328826 5355
## 2 - elevation  1    16565       488   26345391 5353
## 3       - lat  1    24373       489   26369763 5352

# p value= 2.8 X 10^-9 Regression is sifnficant if p-value less than alpha
# regression is significant

plot of chunk unnamed-chunk-5

c) Compute cross-validated and fitted estimates at each observation points and plot them against the observed values.

# get the cross validated estimates..
n = length(precip)

y_hat = 1:n
index = 1:n
for (i in 1:n) {
    index1 = index[index != i]
    Xval = long[index1]
    Yval = precip[index1]

    zz = lsfit(Xval, Yval)

    # now estimate at the point that was dropped
    xpred = c(1, long[i])
    y_hat[i] = sum(zz$coef * xpred)
}

par(mfrow = c(1, 1))
plot(precip, y_hat, ylab = "estimated values")

plot of chunk unnamed-chunk-6

(d) Drop 10% of observations, fit the model (i.e., the 'best' model from i. above) to the rest of the data and predict the dropped points. Compute RMSE and R2 and show them as boxplots.

### Drop 10% of points, fit the model and predict the dropped points..
n = length(precip)
nsim = 500

# initialize vectors
rmse = 1:nsim
rsq = 1:nsim
index = 1:n
N10 = round(0.1 * n)

for (i in 1:nsim) {
    drop = sample(c(1:n), N10)
    keep = setdiff(index, drop)

    xx = long[keep]
    y = precip[keep]
    zz = lm(y ~ xx)

    xx = long[drop]
    yhat = predict(zz, newdata = as.data.frame(xx), type = "response")

    resid = precip[drop] - yhat
    rmse[i] = sqrt(mean((precip[drop] - yhat)^2))
    SST = sum((precip[drop] - mean(precip[drop]))^2)
    SSR = sum((precip[drop] - yhat)^2)
    rsq[i] = 1 - (SSR/SST)
}

par(mfrow = c(1, 2))

# boxplot rmse

zz = myboxplot(rmse, xlab = "", ylab = "Rmse", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rmse), col = "red", cex = 1.25, pch = 16)
title(main = "RMSE: predicted\nvalues of the 10% dropped")

# boxplot rsq
zz = myboxplot(rsq, xlab = "", ylab = "R-Squared", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rsq), col = "red", cex = 1.25, pch = 16)
title(main = "R-Squared: predicted\nvalues of the 10% dropped")

plot of chunk unnamed-chunk-7

(e) Estimate the precipitation and the standard errors on the DEM grid and show them as 3-D plots

## Now we wish to estimate/predict at the points on the DEM
mydata = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_precip.dat"), 
    ncol = 4, byrow = T)
lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]

zz = lm(precip ~ elevation + lat + long)
z1 = stepAIC(zz)

## Start:  AIC=5355
## precip ~ elevation + lat + long
## 
##             Df Sum of Sq      RSS  AIC
## - elevation  1     16565 26345391 5353
## - lat        1     37323 26366148 5354
## <none>                   26328826 5355
## - long       1   1843395 28172221 5386
## 
## Step:  AIC=5353
## precip ~ lat + long
## 
##        Df Sum of Sq      RSS  AIC
## - lat   1     24373 26369763 5352
## <none>              26345391 5353
## - long  1   1996901 28342292 5387
## 
## Step:  AIC=5352
## precip ~ long
## 
##        Df Sum of Sq      RSS  AIC
## <none>              26369763 5352
## - long  1   1977078 28346841 5385


predpoints = read.table("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat", 
    sep = "")
Xpred = cbind(predpoints[, 1], predpoints[, 2], predpoints[, 3])
X = as.data.frame(Xpred)
names(X) = c("lat", "long", "elevation")

ypred = predict.lm(z1, newdata = X, type = "response", se.fit = TRUE)

lat1 = sort(unique(X[, 1]))
long1 = sort(unique(X[, 2]))

par(mfrow = c(1, 2))

ypred.mat = matrix(ypred$fit, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat), main = "Estimated Precipitation")
contour(long1, lat1, t(ypred.mat), add = T)

ypred.mat = matrix(ypred$se.fit, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat), main = "Estimated Error")
contour(long1, lat1, t(ypred.mat), add = T)

plot of chunk unnamed-chunk-8

Question 2

(a)Fit a best generalized linear model

lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]
X = as.matrix(cbind(lat, long, elevation))  #predictor variables

# To fit glm, need to know what family to chose

xeval = seq(max(0, min(precip) - sd(precip)), max(precip) + sd(precip), length = length(precip))
# fit gamma and weibull using fitdistrplus
zgamma = fitdist(precip, "gamma")
options(warn = -1)
zweibull = fitdist(precip, "weibull")
options(warn = -1)

# fit other distributions
fitnorm = dnorm(xeval, mean = mean(precip), sd = sd(precip))
fitlnorm = dlnorm(xeval, meanlog = mean(log(precip)), sdlog = sd(log(precip)))
fitexp = dexp(xeval, rate = (1/mean(precip)))

par(mfrow = c(1, 1))
# plot hist and overlay
hist(precip, probability = TRUE, ylim = c(0, 0.005))
lines(xeval, fitlnorm, col = "orange")
lines(xeval, fitnorm, col = "red")
lines(xeval, fitexp, col = "green")
lines(xeval, dgamma(xeval, shape = zgamma$estimate[1], scale = 1/zgamma$estimate[2]), 
    col = "blue")
lines(xeval, dweibull(xeval, shape = zweibull$estimate[1], scale = zweibull$estimate[2]), 
    col = "purple")
legend("topright", legend = c("lognormal", "normal", "exp", "gamma", "weibull"), 
    lty = 1, lwd = 1, col = c("orange", "red", "green", "blue", "purple"))

plot of chunk unnamed-chunk-9


# Fit model with gamma family; find best link function and model using AIC
X = as.data.frame(X)
zz = glm(precip ~ ., data = X, family = Gamma(link = "inverse"))
bestmodel = stepAIC(zz, trace = F)

X = as.data.frame(X)
zz = glm(precip ~ ., data = X, family = Gamma(link = "log"))
bestmodel = stepAIC(zz, trace = F)

X = as.data.frame(X)
zz = glm(precip ~ ., data = X, family = Gamma(link = "identity"))
bestmodel = stepAIC(zz, trace = F)
# identity is the best link function; it has the lowest AIC

(b) Perform ANOVA and model Diagnostics

n = length(precip)

par(mfrow = c(2, 2))

# Model Diagnostics: Check normality, autocorrelation and homoskedasticity
# of residuals
error = resid(bestmodel)
Y_hat = precip - error

hist(error, probability = T)
# fit and plot a normal distribution to the residuals..
lines(error[order(error)], dnorm(error[order(error)], mean = mean(error), sd = sd(error)), 
    col = "red")
# nonparametric PDF
sm.density(error, add = T, col = "blue")
legend("topright", legend = c("mean error", "non par"), lty = 1, lwd = c(1, 
    1), col = c("red", "blue"))

# Homoskedastic: residuals versus y_hat
plot(Y_hat, error)
abline(h = 0, col = "red")
title("Homoskedasticity")

# Check normality of residuals
qqnorm(error, ylab = "Standardized Residuals", xlab = "Normal Scores", main = "Normal QQPlot of Residuals")
qqline(error, col = "red")

# Autocorrelation test
acf(error)

plot of chunk unnamed-chunk-10


# ANOVA
summary(bestmodel)

## 
## Call:
## glm(formula = precip ~ long, family = Gamma(link = "identity"), 
##     data = X)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9874  -0.3208  -0.0687   0.2125   1.3466  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3888.40     554.64   -7.01  7.9e-12 ***
## long          -41.85       5.27   -7.94  1.4e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## (Dispersion parameter for Gamma family taken to be 0.1633)
## 
##     Null deviance: 87.027  on 490  degrees of freedom
## Residual deviance: 78.531  on 489  degrees of freedom
## AIC: 6596
## 
## Number of Fisher Scoring iterations: 6

c) Compute cross-validated and fitted estimates at each observation points and plot them against the observed values.

# get the cross validated estimates..

n = length(precip)

yhat = 1:n
index = 1:n
for (i in 1:n) {
    index1 = index[index != i]
    Xval = long[index1]
    Yval = precip[index1]

    zz = glm(Yval ~ Xval, family = Gamma(link = "identity"))

    # now estimate at the point that was dropped
    xpred = c(1, long[i])
    yhat[i] = sum(zz$coef * xpred)
}

par(mfrow = c(2, 1))
plot(precip, yhat, ylab = "cross validated estimates")
plot(precip, Y_hat, ylab = "fitted estimates")

plot of chunk unnamed-chunk-11

(d) Drop 10% of observations, fit the model (i.e., the 'best' model from i. above) to the rest of the data and predict the dropped points. Compute RMSE and R2 and show them as boxplots.

### Drop 10% of points, fit the model and predict the dropped points..
n = length(precip)
nsim = 500

# initialize vectors
rmse = 1:nsim
rsq = 1:nsim
index = 1:n
N10 = round(0.1 * n)

for (i in 1:nsim) {
    drop = sample(c(1:n), N10)
    keep = setdiff(index, drop)

    xx = long[keep]
    y = precip[keep]
    xx = as.data.frame(xx)
    zz = glm(y ~ ., data = xx, family = Gamma(link = "identity"))

    xx = long[drop]
    yhat = predict(zz, newdata = as.data.frame(xx), type = "response")

    resid = precip[drop] - yhat
    rmse[i] = sqrt(mean((precip[drop] - yhat)^2))
    SST = sum((precip[drop] - mean(precip[drop]))^2)
    SSR = sum((precip[drop] - yhat)^2)
    rsq[i] = 1 - (SSR/SST)
}

par(mfrow = c(1, 2))

# boxplot rmse
zz = myboxplot(rmse, xlab = "", ylab = "Rmse", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rmse), col = "red", cex = 1.25, pch = 16)
title(main = "RMSE: predicted values\nof the 10% dropped")

# boxplot rsq
zz = myboxplot(rsq, xlab = "", ylab = "R-Squared", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rsq), col = "red", cex = 1.25, pch = 16)
title(main = "R-Squared: predicted values\nof the 10% dropped")

plot of chunk unnamed-chunk-12

(e) Estimate the precipitation and the standard errors on the DEM grid and show them as 3-D plots as in (v) above

X = as.data.frame(X)
zz = glm(precip ~ ., data = X, family = Gamma(link = "log"))
bestmodel = stepAIC(zz)

## Start:  AIC=6604
## precip ~ lat + long + elevation
## 
##             Df Deviance  AIC
## - elevation  1     79.1 6602
## - lat        1     79.2 6603
## <none>             79.0 6604
## - long       1     86.4 6646
## 
## Step:  AIC=6602
## precip ~ lat + long
## 
##        Df Deviance  AIC
## - lat   1     79.2 6601
## <none>        79.1 6602
## - long  1     87.0 6648
## 
## Step:  AIC=6601
## precip ~ long
## 
##        Df Deviance  AIC
## <none>        79.2 6601
## - long  1     87.0 6646


predpoints = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat"), 
    ncol = 3, byrow = T)
X = as.data.frame(predpoints)
names(X) = c("lat", "long", "elevation")

lat1 = sort(unique(X[, 1]))
long1 = sort(unique(X[, 2]))
ypred = predict.glm(bestmodel, newdata = X, type = "response", se.fit = TRUE)

par(mfrow = c(1, 2))

ypred.mat = matrix(ypred$fit, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat))
contour(long1, lat1, t(ypred.mat), add = T)

ypred.mat = matrix(ypred$se.fit, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat))
contour(long1, lat1, t(ypred.mat), add = T)

plot of chunk unnamed-chunk-13

Question 3

data

Xf = matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), nrow = 10, ncol = 1, byrow = T)
Yf = matrix(c(-1.4, 2.45, 1.45, 5.38, 5.6, 5.99, 8.1, 7.54, 8.24, 10.8), nrow = 10, 
    ncol = 1, byrow = T)
N = length(Yf)
alpha = 0.5
K = round(alpha * N)
L = matrix(0, ncol = N, nrow = N)
yest = as.numeric(1:N)

(i) Evaluate the L matrix;

# by formula
for (i in 1:N) {
    # the point at which you want the estimation.
    xp = Xf[i, ]
    xx = rbind(xp, Xf)
    xx = as.matrix(dist(xx))
    nxx = length(xx[1, ])
    xdist = xx[1, 2:nxx]

    xorder = order(xdist)
    kdist = xdist[xorder[K]]  #distance to Kth nearest neighbor
    kdist = xdist[xdist >= kdist]  #make all distances greater than distk
    # get the weights
    weights = 15 * ((1 - ((xdist/kdist) * (xdist/kdist)))^2)/16

    weights = weights/sum(weights)

    # create a diagonal matrix with these weights..
    W = diag(weights)  #points outside of the K neighbors gets a weight of 0

    # add the first column to be 1 to get
    ax = t(t(Xf) - xp)
    xk = cbind(rep(1, N), ax)
    yk = Yf

    evect = rep(0, length(Xf[1]) + 1)  #unit vector
    evect[1] = 1
    L[i, ] = evect %*% (solve(t(xk) %*% W %*% xk)) %*% (t(xk) %*% W)
    yest[i] = sum(L[, i] * Yf)
}
L

##         [,1]   [,2]   [,3]   [,4]    [,5]    [,6]     [,7]     [,8]
##  [1,] 0.4114 0.3171 0.2187 0.1396 0.08000 0.03582  0.01196 -0.05495
##  [2,] 0.2698 0.2900 0.2198 0.1478 0.09034 0.04729  0.10301 -0.02952
##  [3,] 0.0000 0.1566 0.2385 0.1745 0.11197 0.06683  0.03579  1.07571
##  [4,] 0.0000 0.0000 0.1919 0.2903 0.21044 0.13307  0.07747  0.05256
##  [5,] 0.0000 0.0000 0.0000 0.2526 0.37345 0.26184  0.15691  0.02811
##  [6,] 0.0000 0.0000 0.0000 0.0000 0.29046 0.39678  0.24363  0.11120
##  [7,] 0.0000 0.0000 0.0000 0.0000 0.00000 0.31579  0.40352  0.23219
##  [8,] 0.0000 0.0000 0.0000 0.0000 0.00000 0.00000  0.32206  0.40484
##  [9,] 0.0000 0.0000 0.0000 0.0000 0.00000 0.00000  0.09296  0.20246
## [10,] 0.0000 0.0000 0.0000 0.0000 0.00000 0.00000 -0.17982  0.07674
##            [,9]    [,10]
##  [1,] -0.076772 -0.08282
##  [2,] -0.064856 -0.07372
##  [3,] -0.054998 -0.80487
##  [4,]  0.993022 -0.94874
##  [5,]  0.043146 -0.11601
##  [6,]  0.007276 -0.04935
##  [7,]  0.061929 -0.01342
##  [8,]  0.224129  0.04897
##  [9,]  0.316201  0.38838
## [10,]  0.385964  0.71711


# compare with locfit
N = length(Yf)
L1 = matrix(0, nrow = N, ncol = N)
X = as.matrix(Xf)

for (i in 1:N) {
    L1[i, ] = locfit(Yf ~ Xf, alpha = 0.5, deg = 1, geth = 1, kern = "bisq", 
        scale = TRUE, ev = Xf[i, ])
}
L1

##         [,1]   [,2]     [,3]     [,4]   [,5]   [,6]     [,7]     [,8]
##  [1,] 0.7919 0.3261 -0.02797 -0.09007 0.0000 0.0000  0.00000  0.00000
##  [2,] 0.3499 0.3632  0.22402  0.06293 0.0000 0.0000  0.00000  0.00000
##  [3,] 0.0000 0.2647  0.47059  0.26471 0.0000 0.0000  0.00000  0.00000
##  [4,] 0.0000 0.0000  0.26471  0.47059 0.2647 0.0000  0.00000  0.00000
##  [5,] 0.0000 0.0000  0.00000  0.26471 0.4706 0.2647  0.00000  0.00000
##  [6,] 0.0000 0.0000  0.00000  0.00000 0.2647 0.4706  0.26471  0.00000
##  [7,] 0.0000 0.0000  0.00000  0.00000 0.0000 0.2647  0.47059  0.26471
##  [8,] 0.0000 0.0000  0.00000  0.00000 0.0000 0.0000  0.26471  0.47059
##  [9,] 0.0000 0.0000  0.00000  0.00000 0.0000 0.0000  0.06293  0.22402
## [10,] 0.0000 0.0000  0.00000  0.00000 0.0000 0.0000 -0.09007 -0.02797
##         [,9]  [,10]
##  [1,] 0.0000 0.0000
##  [2,] 0.0000 0.0000
##  [3,] 0.0000 0.0000
##  [4,] 0.0000 0.0000
##  [5,] 0.0000 0.0000
##  [6,] 0.0000 0.0000
##  [7,] 0.0000 0.0000
##  [8,] 0.2647 0.0000
##  [9,] 0.3632 0.3499
## [10,] 0.3261 0.7919

(ii) Evaluate the GCV

# using locfit
model = gcvplot(Yf ~ Xf, alpha = 0.5, deg = 1, kern = "bisq", ev = dat(), scale = TRUE)
sort(model$values)[1]

## [1] 2.75

(iii) Estimate the value of Y at each X and also the 95% confidence interval (plot the fit and the confidence interval). Check your results with the output from LOCFIT package - they both should match.

nu1 = sum(diag(L))
nu2 = sum(diag((t(L) %*% L)))
RSS = sum((Yf - yest)^2)
sigmae = RSS/(N - 2 * nu1 + nu2)  #error variance
stderror = sqrt((diag(L %*% t(L)))) * sqrt(sigmae)

par(mfrow = c(1, 2))
cl = qnorm(0.975)  #95% confidence level

## confidence intervals are
lowerc = yest - cl * stderror
upperc = yest + cl * stderror

plot(Xf, Yf, xlab = "X values", ylab = "Y values", main = "output from formula 95% CI interval")
# intervals
lines(lowerc, lty = "dashed", col = "red")
lines(upperc, lty = "dashed", col = "blue")

# Compare with locfit
model = locfit(Yf ~ Xf, alpha = 0.5, deg = 1, kern = "bisq", ev = dat(), scale = TRUE)
zp = predict(model, se.fit = T)

lowerc = zp$fit - cl * zp$se
upperc = zp$fit + cl * zp$se

plot(Xf, Yf, xlab = "X values", ylab = "Y values", main = "output from model 95% CI interval")
# intervals
lines(lowerc, lty = "dashed", col = "red")
lines(upperc, lty = "dashed", col = "blue")

plot of chunk unnamed-chunk-17

(iv) As another check if you fix alpha = 1 and all the weights to be same (you can make them all equal to 1.0), the results from your local polynomial code should be identical to that from linear regression.

# redefine L
N = length(Yf)
L1 = matrix(0, nrow = N, ncol = N)
Xf = as.matrix(Xf)
for (i in 1:N) {
    L1[i, ] = locfit(Yf ~ Xf, alpha = 1, deg = 1, geth = 1, kern = "bisq", scale = TRUE, 
        ev = Xf[i, ])
}
L1

##         [,1]       [,2]      [,3]     [,4]      [,5]      [,6]     [,7]
##  [1,] 0.4857  0.3593134  0.226838  0.10563  0.010528 -0.048269 -0.06738
##  [2,] 0.3511  0.2921097  0.214970  0.13323  0.060062  0.006224 -0.02198
##  [3,] 0.2259  0.2232220  0.197389  0.15561  0.107051  0.061106  0.02561
##  [4,] 0.1146  0.1521096  0.171296  0.16993  0.149953  0.116428  0.07654
##  [5,] 0.0243  0.0768077  0.132313  0.17282  0.187519  0.172817  0.13231
##  [6,] 0.0000  0.0243024  0.076808  0.13231  0.172817  0.187519  0.17282
##  [7,] 0.0000  0.0105953  0.038510  0.07654  0.116428  0.149953  0.16993
##  [8,] 0.0000 -0.0009851  0.005094  0.02561  0.061106  0.107051  0.15561
##  [9,] 0.0000 -0.0109709 -0.024779 -0.02198  0.006224  0.060062  0.13323
## [10,] 0.0000 -0.0199534 -0.052387 -0.06738 -0.048269  0.010528  0.10563
##            [,8]       [,9]  [,10]
##  [1,] -0.052387 -0.0199534 0.0000
##  [2,] -0.024779 -0.0109709 0.0000
##  [3,]  0.005094 -0.0009851 0.0000
##  [4,]  0.038510  0.0105953 0.0000
##  [5,]  0.076808  0.0243024 0.0000
##  [6,]  0.132313  0.0768077 0.0243
##  [7,]  0.171296  0.1521096 0.1146
##  [8,]  0.197389  0.2232220 0.2259
##  [9,]  0.214970  0.2921097 0.3511
## [10,]  0.226838  0.3593134 0.4857


# compare to hat matrix from linear regression
XX = cbind(rep(1, N), X)
# hat= X
hatmat = XX %*% solve(t(XX) %*% XX) %*% t(XX)
hatmat

##           [,1]      [,2]      [,3]    [,4]    [,5]    [,6]    [,7]
##  [1,]  0.34545  0.290909  0.236364 0.18182 0.12727 0.07273 0.01818
##  [2,]  0.29091  0.248485  0.206061 0.16364 0.12121 0.07879 0.03636
##  [3,]  0.23636  0.206061  0.175758 0.14545 0.11515 0.08485 0.05455
##  [4,]  0.18182  0.163636  0.145455 0.12727 0.10909 0.09091 0.07273
##  [5,]  0.12727  0.121212  0.115152 0.10909 0.10303 0.09697 0.09091
##  [6,]  0.07273  0.078788  0.084848 0.09091 0.09697 0.10303 0.10909
##  [7,]  0.01818  0.036364  0.054545 0.07273 0.09091 0.10909 0.12727
##  [8,] -0.03636 -0.006061  0.024242 0.05455 0.08485 0.11515 0.14545
##  [9,] -0.09091 -0.048485 -0.006061 0.03636 0.07879 0.12121 0.16364
## [10,] -0.14545 -0.090909 -0.036364 0.01818 0.07273 0.12727 0.18182
##            [,8]      [,9]    [,10]
##  [1,] -0.036364 -0.090909 -0.14545
##  [2,] -0.006061 -0.048485 -0.09091
##  [3,]  0.024242 -0.006061 -0.03636
##  [4,]  0.054545  0.036364  0.01818
##  [5,]  0.084848  0.078788  0.07273
##  [6,]  0.115152  0.121212  0.12727
##  [7,]  0.145455  0.163636  0.18182
##  [8,]  0.175758  0.206061  0.23636
##  [9,]  0.206061  0.248485  0.29091
## [10,]  0.236364  0.290909  0.34545

(v) Replace the last value 10.8 with a high value to create an outlier, compute the L matrix (with alpha = 0.9) and Hat matrix from linear regression - compare the weights and comment.

Yf[10] = 50
N = length(Yf)
L1 = matrix(0, nrow = N, ncol = N)
Xf = as.matrix(Xf)
for (i in 1:N) {
    L1[i, ] = locfit(Yf ~ Xf, alpha = 0.9, deg = 1, geth = 1, kern = "bisq", 
        scale = TRUE, ev = X[i, ])
}
L1

##          [,1]    [,2]      [,3]     [,4]      [,5]      [,6]     [,7]
##  [1,] 0.52767 0.37256  0.212079  0.07246 -0.025353 -0.069908 -0.06344
##  [2,] 0.36509 0.30124  0.213038  0.12042  0.042349 -0.007111 -0.02277
##  [3,] 0.21549 0.22581  0.205077  0.16187  0.108584  0.058276  0.02154
##  [4,] 0.08621 0.14435  0.183102  0.19278  0.172225  0.127696  0.07171
##  [5,] 0.00000 0.04487  0.131868  0.20604  0.234432  0.206044  0.13187
##  [6,] 0.00000 0.00000  0.044872  0.13187  0.206044  0.234432  0.20604
##  [7,] 0.00000 0.00000  0.021925  0.07171  0.127696  0.172225  0.19278
##  [8,] 0.00000 0.00000  0.003357  0.02154  0.058276  0.108584  0.16187
##  [9,] 0.00000 0.00000 -0.012257 -0.02277 -0.007111  0.042349  0.12042
## [10,] 0.00000 0.00000 -0.026072 -0.06344 -0.069908 -0.025353  0.07246
##            [,8]    [,9]   [,10]
##  [1,] -0.026072 0.00000 0.00000
##  [2,] -0.012257 0.00000 0.00000
##  [3,]  0.003357 0.00000 0.00000
##  [4,]  0.021925 0.00000 0.00000
##  [5,]  0.044872 0.00000 0.00000
##  [6,]  0.131868 0.04487 0.00000
##  [7,]  0.183102 0.14435 0.08621
##  [8,]  0.205077 0.22581 0.21549
##  [9,]  0.213038 0.30124 0.36509
## [10,]  0.212079 0.37256 0.52767


# compare to hat matrix from linear regression
XX = cbind(rep(1, N), X)
# hat= X
hatmat = XX %*% solve(t(XX) %*% XX) %*% t(XX)
hatmat

##           [,1]      [,2]      [,3]    [,4]    [,5]    [,6]    [,7]
##  [1,]  0.34545  0.290909  0.236364 0.18182 0.12727 0.07273 0.01818
##  [2,]  0.29091  0.248485  0.206061 0.16364 0.12121 0.07879 0.03636
##  [3,]  0.23636  0.206061  0.175758 0.14545 0.11515 0.08485 0.05455
##  [4,]  0.18182  0.163636  0.145455 0.12727 0.10909 0.09091 0.07273
##  [5,]  0.12727  0.121212  0.115152 0.10909 0.10303 0.09697 0.09091
##  [6,]  0.07273  0.078788  0.084848 0.09091 0.09697 0.10303 0.10909
##  [7,]  0.01818  0.036364  0.054545 0.07273 0.09091 0.10909 0.12727
##  [8,] -0.03636 -0.006061  0.024242 0.05455 0.08485 0.11515 0.14545
##  [9,] -0.09091 -0.048485 -0.006061 0.03636 0.07879 0.12121 0.16364
## [10,] -0.14545 -0.090909 -0.036364 0.01818 0.07273 0.12727 0.18182
##            [,8]      [,9]    [,10]
##  [1,] -0.036364 -0.090909 -0.14545
##  [2,] -0.006061 -0.048485 -0.09091
##  [3,]  0.024242 -0.006061 -0.03636
##  [4,]  0.054545  0.036364  0.01818
##  [5,]  0.084848  0.078788  0.07273
##  [6,]  0.115152  0.121212  0.12727
##  [7,]  0.145455  0.163636  0.18182
##  [8,]  0.175758  0.206061  0.23636
##  [9,]  0.206061  0.248485  0.29091
## [10,]  0.236364  0.290909  0.34545

Question 4

(a) Fit a best linear regression model

mydata = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_precip.dat"), 
    ncol = 4, byrow = T)
lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]
X = as.matrix(cbind(lat, long, elevation))  #predictor variables

# Note: qguassian is default family Note: thebest is an original function
# that outputs bestalpha based on X, Y, degree and family
mystud1 = thebest(X, precip, 1, "qguassian")
mystud2 = thebest(X, precip, 2, "qguassian")

mystud1$gcv

## [1] 46354

mystud2$gcv

## [1] 46994


# degree 1 is best
bestalpha = mystud1$bestalpha

# Find best predictors
combs = leaps(X, precip)
combos = combs$which
ncombos = length(combos[, 1])
z1 = 1:ncombos

for (i in 1:ncombos) {
    zz = gcvplot(precip ~ X[, combos[i, ]], alpha = bestalpha, deg = 1, kern = "bisq", 
        ev = dat(), scale = TRUE)
    z1[i] = as.vector(zz$values)
}
z2 = order(z1)[1]
bestpredictors = combs$which[z2, ]
bestpredictors

##     1     2     3 
##  TRUE  TRUE FALSE

# the lowest GVC is with lat + long

# Now fit the LOCFIT model
bestmodel = locfit(precip ~ lat + long, alpha = bestalpha, deg = 1, kern = "bisq", 
    scale = TRUE)

# goodness of fit Now fit the LOCFIT model using the best alpha and degree
# obtained from above..  compute RSS0, RSS1 test goodness of fit..
RSS1 = sum(residuals(bestmodel, type = "response")^2)
nu1 = sum(fitted(bestmodel, what = "infl"))  # trace(L)
nu2 = sum(fitted(bestmodel, what = "vari"))  # trace(L^T L)

nu11 = N - 2 * nu1 + nu2

## Linear regression..

zz = lm(precip ~ lat + long)

N = length(precip)
XX = cbind(rep(1, N), X)
# Compute the Hat matrix
hatm = XX %*% solve(t(XX) %*% XX) %*% t(XX)

II = diag(N)
delta0 = t(II - hatm) %*% (II - hatm)  #Equation 9.2
nu00 = sum(diag(delta0))

RSS0 = sum(residuals(zz, type = "response")^2)

Fdata = (RSS0 - RSS1)/(nu00 - nu11)
Fdata = (Fdata/(RSS1/nu11))
Ftheor = qf(0.95, (nu00 - nu11), nu11)  #95% confidence level..
Fdata

## [1] -0.08601

Ftheor

## [1] NaN

## Fdata > Ftheor - reject null - i.e., data fits a local poly model

(b) Perform ANOVA and model diagnostics

par(mfrow = c(2, 2))

error = residuals(bestmodel)
Y_hat = precip - error

hist(error, probability = T)
# fit and plot a normal distribution to the residuals..
lines(error[order(error)], dnorm(error[order(error)], mean = mean(error), sd = sd(error)), 
    col = "red")
# nonparametric PDF
sm.density(error, add = T, col = "blue")

# Homoskedastic: residuals versus y_hat
plot(Y_hat, error)
abline(h = 0, col = "red")
title("Homoskedasticity")

# Check normality of residuals
qqnorm(error, ylab = "Standardized Residuals", xlab = "Normal Scores", main = "Normal QQPlot of Residuals")
qqline(error, col = "red")

# Autocorrelation test
acf(error)

plot of chunk unnamed-chunk-21

c) Compute cross-validated and fitted estimates at each observation points and plot them against the observed values.

n = length(precip)
ycv = 1:n
index = 1:n
X = cbind(lat, long)

for (i in 1:n) {

    # get all the data points by dropping the ith point
    index1 = index[index != i]
    xx = X[index1]
    yy = precip[index1]

    # the dropped point
    xp = X[i]

    zl = locfit(yy ~ xx, alpha = bestalpha, deg = 1, kern = "bisq", scale = TRUE)
    ycv[i] = predict(zl, xp)
}

par(mfrow = c(2, 1))
plot(precip, ycv, ylab = "cross validated estimates")
plot(precip, Y_hat, ylab = "fitted estimates")

plot of chunk unnamed-chunk-22

(d) Drop 10% of observations, fit the model (i.e., the 'best' model from i. above) to the rest of the data and predict the dropped points. Compute RMSE and R2 and show them as boxplots.

### Drop 10% of points, fit the model and predict the dropped points..
n = length(precip)
nsim = 500

# initialize vectors
rmse = 1:nsim
rsq = 1:nsim
index = 1:n
N10 = round(0.1 * n)

for (i in 1:nsim) {
    drop = sample(c(1:n), N10)
    keep = setdiff(index, drop)

    xx = long[keep]
    y = precip[keep]
    xx = as.data.frame(xx)
    zz = locfit(y ~ ., data = xx, alpha = bestalpha, deg = 1, kern = "bisq", 
        scale = TRUE)

    xx = long[drop]
    yhat = predict(zz, newdata = as.data.frame(xx))

    resid = precip[drop] - yhat
    rmse[i] = sqrt(mean((precip[drop] - yhat)^2))
    SST = sum((precip[drop] - mean(precip[drop]))^2)
    SSR = sum((precip[drop] - yhat)^2)
    rsq[i] = 1 - (SSR/SST)
}

par(mfrow = c(1, 2))

# boxplot rmse
zz = myboxplot(rmse, xlab = "", ylab = "Rmse", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rmse), col = "red", cex = 1.25, pch = 16)
title(main = "RMSE: predicted values\nof the 10% dropped")

# boxplot rsq
zz = myboxplot(rsq, xlab = "", ylab = "R-Squared", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rsq), col = "red", cex = 1.25, pch = 16)
title(main = "R-Squared: predicted values\nof the 10% dropped")

plot of chunk unnamed-chunk-23

(e) Estimate the precipitation and the standard errors on the DEM grid and show them as 3-D plots as in (v) above

bestmodel = locfit(precip ~ lat + long, alpha = bestalpha, deg = 1, kern = "bisq", 
    scale = TRUE)

predpoints = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat"), 
    ncol = 3, byrow = T)
X = as.data.frame(predpoints)
names(X) = c("lat", "long", "elevation")

ypred = predict(bestmodel, newdata = X, type = "response", se.fit = TRUE)

lat1 = sort(unique(X[, 1]))
long1 = sort(unique(X[, 2]))

par(mfrow = c(1, 2))

ypred.mat = matrix(ypred$fit, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat), main = "Estimated Precipitation")
contour(long1, lat1, t(ypred.mat), add = T)

ypred.mat = matrix(ypred$se.fit, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat), main = "Estimated Errors")
contour(long1, lat1, t(ypred.mat), add = T)

plot of chunk unnamed-chunk-24

Question 5: Reapeat one using local Polynomial with best family & link function

options for family: “gaussian”, “binomial”, “poisson”, “gamma”, “geom”, “density”, or “q+one of the ones listed” options for link: “ident”,“log”,“logit”,“inverse”,“sqrt”, “arcsin”

(a) Fit a best linear regression model

lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]
X = as.matrix(cbind(lat, long, elevation))  #predictor variables

fam = c("gamma", "qgamma", "guassian", "qguassian")
Nc = length(fam)
mystud1 = 1:Nc
mystud2 = 1:Nc

for (i in 1:Nc) {
    mystud1[i] = list(thebest(X, precip, 1, fam[i]))
    mystud2[i] = list(thebest(X, precip, 2, fam[i]))
}

best.deg1 = order(c(mystud1[[1]]$gcv, mystud1[[2]]$gcv, mystud1[[3]]$gcv, mystud1[[4]]$gcv))
mystud1[[best.deg1[1]]]

## $gcv
## [1] 0.1309
## 
## $bestalpha
## [1] 0.06629
## 
## $family
## [1] "gamma"


best.deg2 = order(c(mystud2[[1]]$gcv, mystud2[[2]]$gcv, mystud2[[3]]$gcv, mystud2[[4]]$gcv))
mystud2[[best.deg2[1]]]

## $gcv
## [1] 0.1339
## 
## $bestalpha
## [1] 0.2285
## 
## $family
## [1] "gamma"


# lowest gcv occurs with deg=1 and family= 'gamma'

bestalpha = mystud1[[best.deg1[1]]]$bestalpha
bestfamily = mystud1[[best.deg1[1]]]$family

# Find best predictors
combs = leaps(X, precip)
combos = combs$which
ncombos = length(combos[, 1])
z1 = 1:ncombos

for (i in 1:ncombos) {
    zz = gcvplot(precip ~ X[, combos[i, ]], alpha = bestalpha, deg = 1, kern = "bisq", 
        ev = dat(), scale = TRUE, family = bestfamily)
    z1[i] = as.vector(zz$values)
}
z2 = order(z1)[1]
bestpredictors = combs$which[z2, ]
bestpredictors

##     1     2     3 
##  TRUE  TRUE FALSE

# the lowest GVC is with lat + long

# find best link
zz = gcvplot(precip ~ lat + long, alpha = bestalpha, deg = 1, kern = "bisq", 
    ev = dat(), scale = TRUE, family = bestfamily, link = "log")
zz1 = gcvplot(precip ~ lat + long, alpha = bestalpha, deg = 1, kern = "bisq", 
    ev = dat(), scale = TRUE, family = bestfamily, link = "inverse")
zz2 = gcvplot(precip ~ lat + long, alpha = bestalpha, deg = 1, kern = "bisq", 
    ev = dat(), scale = TRUE, family = bestfamily, link = "ident")

sort(zz$values)[1]

## [1] 0.1185

sort(zz1$values)[1]

## [1] 0.1181

sort(zz2$values)[1]

## [1] 0.1194

# use link= 'log'

# Now fit the LOCFIT model
bestmodel = locfit(precip ~ lat + long, alpha = bestalpha, deg = 1, kern = "bisq", 
    scale = TRUE, family = bestfamily, link = "log")

# Goodness of Fit
Y = precip
X = as.matrix(cbind(X[, 1:2]))
mymod = locfit(Y ~ X, deg = 2, alpha = bestalpha, kern = "bisq", scale = TRUE, 
    ev = dat(), family = bestfamily)
RSS1 = sum(residuals(mymod)^2)
nu1 = sum(fitted(mymod, what = "infl"))
nu2 = sum(fitted(mymod, what = "vari"))
nu11 = N - 2 * nu1 + nu2

# Compare with gamma glm model using same vars (lon, lat)
zz = glm(Y ~ ., data = as.data.frame(X), family = Gamma(link = "identity"))

XX = as.matrix(cbind(rep(1, N), X))

hatm = XX %*% solve(t(XX) %*% XX) %*% t(XX)

II = diag(N)
delta0 = t(II - hatm) %*% (II - hatm)
nu00 = sum(diag(delta0))

RSS0 = sum(residuals(zz, type = "response")^2)

nu0 = length(zz$coef)  #number of model coefficients
nu00 = N - nu0


Fdata = (RSS0 - RSS1)/(nu00 - nu11)
Fdata = (Fdata/(RSS1/nu11))
Ftheor = qf(0.95, (nu00 - nu11), nu11)  #95% confidence level..

Ftheor

## [1] 1.248

Fdata

## [1] 1810605

## Fdata > Ftheor - reject null - i.e., data fits a local poly model
## better than best glm model

(b) Perform ANOVA and model diagnostics

par(mfrow = c(2, 2))

error = residuals(bestmodel)
Y_hat = precip - error

hist(error, probability = T)
# fit and plot a normal distribution to the residuals..
lines(error[order(error)], dnorm(error[order(error)], mean = mean(error), sd = sd(error)), 
    col = "red")
# nonparametric PDF
sm.density(error, add = T, col = "blue")

# Homoskedastic: residuals versus y_hat
plot(Y_hat, error)
abline(h = 0, col = "red")
title("Homoskedasticity")

# Check normality of residuals
qqnorm(error, ylab = "Standardized Residuals", xlab = "Normal Scores", main = "Normal QQPlot of Residuals")
qqline(error, col = "red")

# Autocorrelation test
acf(error)

plot of chunk unnamed-chunk-26

c) Compute cross-validated and fitted estimates at each observation points and plot them against the observed values.

n = length(precip)
ycv = 1:n
index = 1:n
X = cbind(lat, long)

for (i in 1:n) {

    # get all the data points by dropping the ith point
    index1 = index[index != i]
    xx = X[index1]
    yy = precip[index1]

    # the dropped point
    xp = X[i]

    zl = locfit(yy ~ xx, alpha = bestalpha, deg = 1, kern = "bisq", scale = TRUE, 
        family = bestfamily, link = "log")
    ycv[i] = predict(zl, xp)
}

par(mfrow = c(2, 1))
plot(precip, ycv, ylab = "cross validated estimates")
plot(precip, Y_hat, ylab = "fitted estimates")

plot of chunk unnamed-chunk-27

(d) Drop 10% of observations, fit the model (i.e., the 'best' model from i. above) to the rest of the data and predict the dropped points. Compute RMSE and R2 and show them as boxplots.

### Drop 10% of points, fit the model and predict the dropped points..
n = length(precip)
nsim = 500

# initialize vectors
rmse = 1:nsim
rsq = 1:nsim
index = 1:n
N10 = round(0.1 * n)
X = cbind(lat, long)

for (i in 1:nsim) {
    drop = sample(c(1:n), N10)
    keep = setdiff(index, drop)

    xx = X[keep]
    y = precip[keep]
    xx = as.data.frame(xx)
    zz = locfit(y ~ ., data = xx, alpha = bestalpha, deg = 1, kern = "bisq", 
        scale = TRUE, family = bestfamily, link = "log")

    xx = X[drop]
    yhat = predict(zz, newdata = as.data.frame(xx))

    resid = precip[drop] - yhat
    rmse[i] = sqrt(mean((precip[drop] - yhat)^2))
    SST = sum((precip[drop] - mean(precip[drop]))^2)
    SSR = sum((precip[drop] - yhat)^2)
    rsq[i] = 1 - (SSR/SST)
}

par(mfrow = c(1, 2))

# boxplot rmse
zz = myboxplot(rmse, xlab = "", ylab = "Rmse", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rmse), col = "red", cex = 1.25, pch = 16)
title(main = "RMSE: predicted values\nof the 10% dropped")

# boxplot rsq
zz = myboxplot(rsq, xlab = "", ylab = "R-Squared", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rsq), col = "red", cex = 1.25, pch = 16)
title(main = "R-Squared: predicted values\nof the 10% dropped")

plot of chunk unnamed-chunk-28

(e) Estimate the precipitation and the standard errors on the DEM grid and show them as 3-D plots as in (v) above

predpoints = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat"), 
    ncol = 3, byrow = T)
X = as.data.frame(predpoints)
names(X) = c("lat", "long", "elevation")

ypred = predict(bestmodel, newdata = X, type = "response", se.fit = TRUE)

lat1 = sort(unique(X[, 1]))
long1 = sort(unique(X[, 2]))

par(mfrow = c(1, 2))

ypred.mat = matrix(ypred$fit, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat), main = "Estimated Precipitation")
contour(long1, lat1, t(ypred.mat), add = T)

ypred.mat = matrix(ypred$se.fit, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat), main = "Estimated Errors")
contour(long1, lat1, t(ypred.mat), add = T)

plot of chunk unnamed-chunk-29

Question 6

Estimate the spatial surface using Kriging i. Fit a variogram using co-kriging (i.e., using latitude, longitude and elevation)

lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]
X = as.data.frame(cbind(lat, long, elevation))  #predictor variables

par(mar = c(4, 4, 1, 1))
look <- vgram(X, precip, N = 15, lon.lat = FALSE)
bplot.xy(look$d, look$vgram, breaks = look$breaks, outline = FALSE, xlab = "distance (degrees)", 
    ylab = "variogram")
lines(look$centers, look$stats["mean", ], col = 4)
points(look$centers, look$stats[2, ], col = "blue")

par(mfrox = c(1, 1))

## Warning: "mfrox" is not a graphical parameter


xd = look$centers
ym = look$stats[2, ]
sigma = 1
nls.fit = nls(ym ~ sigma^2 + rho * (1 - exp(-xd/theta)), start = list(rho = max(ym), 
    theta = quantile(xd, 0.1)), control = list(maxiter = 200))
pars = as.data.frame(coefficients(nls.fit))
rho = pars[1, 1]
theta = pars[2, 1]

xr = round(max(xd) + sd(xd))
dgrid <- seq(0, xr, , 400)
lines(dgrid, sigma^2 + rho * (1 - exp(-1 * dgrid/theta)), col = "blue", lwd = 3)

plot of chunk unnamed-chunk-30

ii. Compute cross-validated and fitted estimates at each observation points and plot them against the observed values.

# get the cross validated estimates..
n = length(precip)
ypred = 1:n
index = 1:n

for (i in 1:n) {
    # get all the data points by dropping the ith point
    index1 = index[index != i]

    x1 = lat[index1]
    x2 = long[index1]

    xx = cbind(x1, x2)
    yy = precip[index1]
    zz = elevation[index1]

    z1 = Krig(xx, yy, Z = zz, rho = rho, theta = theta, sigma2 = sigma, m = 2)

    # the dropped point

    zp = elevation[i]
    xp = X[i, 1:2]

    ypred[i] = predict.Krig(z1, x = as.matrix(xp), Z = as.matrix(zp), drop.Z = FALSE)
}

par(mfrow = c(1, 1))
plot(precip, ypred, ylab = "cross validated estimates")

plot of chunk unnamed-chunk-31

iv. Drop 10% of observations, fit the model (i.e., the 'best' model from i. above) to the rest of the data and predict the dropped points. Compute RMSE and R2 and show them as boxplots.

### Drop 10% of points, fit the model and predict the dropped points..
n = length(precip)
nsim = 500

# initialize vectors
rmse = 1:nsim
rsq = 1:nsim
index = 1:n
N10 = round(0.1 * n)

for (i in 1:nsim) {
    drop = sample(c(1:n), N10)
    keep = setdiff(index, drop)

    xx1 = lat[keep]
    xx2 = long[keep]

    xx = cbind(xx1, xx2)
    zz = elevation[keep]
    y = precip[keep]

    z1 = Krig(xx, y, Z = zz, rho = rho, theta = theta, sigma2 = sigma, m = 2)

    xp = X[drop, 1:2]
    zp = elevation[drop]

    ypred[i] = predict.Krig(z1, x = as.matrix(xp), Z = as.matrix(zp), drop.Z = FALSE)

    resid = precip[drop] - ypred
    rmse[i] = sqrt(mean((precip[drop] - ypred)^2))
    SST = sum((precip[drop] - mean(precip[drop]))^2)
    SSR = sum((precip[drop] - ypred)^2)
    rsq[i] = 1 - (SSR/SST)
}

par(mfrow = c(1, 2))

# boxplot rmse
zz = myboxplot(rmse, xlab = "", ylab = "Rmse", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rmse), col = "red", cex = 1.25, pch = 16)
title(main = "RMSE: predicted values\nof the 10% dropped")

# boxplot rsq
zz = myboxplot(rsq, xlab = "", ylab = "R-Squared", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rsq), col = "red", cex = 1.25, pch = 16)
title(main = "R-Squared: predicted values\nof the 10% dropped")

plot of chunk unnamed-chunk-32

v. Predict on DEM. PLot model estimates and Model errors.

zz = Krig(X[, 1:2], precip, Z = elevation, rho = rho, theta = theta, sigma2 = sigma, 
    m = 2)

predpoints = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat"), 
    ncol = 3, byrow = T)
X = as.data.frame(predpoints)
names(X) = c("lat", "long", "elevation")

### spatial map of estimates and errors - modify to add beautify..
lat1 = sort(unique(X[, 1]))
long1 = sort(unique(X[, 2]))
nrs = length(long1)
ncs = length(lat1)

par(mfrow = c(1, 2))

yp = predict.Krig(zz, x = X[, 1:2], Z = X[, 3], drop.Z = FALSE)
zmat = matrix(yp, nrow = ncs, ncol = nrs)

image.plot(long1, lat1, t(zmat), zlim = range(precip), main = "Estimated Precipitation")
contour(long1, lat1, t(zmat), add = T)

yp.se = predict.se.Krig(zz, x = X[, 1:2], Z = X[, 3], drop.Z = FALSE)
zmat = matrix(yp.se, nrow = ncs, ncol = nrs)

image.plot(long1, lat1, t(zmat), main = "Estimated Error")
contour(long1, lat1, t(zmat), add = T)

plot of chunk unnamed-chunk-33

Question 7

Fit a hierarchal spatial model on the errors of glm fit.

i. Look at variogram and fit model

X = cbind(long, lat, elevation)
zglm = glm(precip ~ ., data = as.data.frame(X), family = Gamma(link = "identity"))
bestmodel = stepAIC(zglm, trace = F)
yres = bestmodel$resid

xs = cbind(long, lat)
nobs = length(yres)

par(mfrow = c(1, 1))

sigma = 1
look <- vgram(xs, yres, N = 15, lon.lat = FALSE)
bplot.xy(look$d, look$vgram, breaks = look$breaks, outline = FALSE, xlab = "distance (degrees)", 
    ylab = "variogram")
points(look$centers, look$stats[2, ], col = "blue")

# fit of exponential by nonlinear least squares
xd <- look$centers
ym <- look$stats[2, ]
sigma <- 1  #
nls.fit <- nls(ym ~ sigma^2 + rho * (1 - exp(-xd/theta)), start = list(rho = max(look$stats[2, 
    ]), theta = quantile(xd, 0.1)), control = list(maxiter = 200))
pars <- coefficients(nls.fit)
rho <- pars[1]
theta <- pars[2]
xr = round(max(xd) + sd(xd))
dgrid <- seq(0, xr, , 400)
lines(dgrid, sigma^2 + rho * (1 - exp(-1 * dgrid/theta)), col = "blue", lwd = 3)

plot of chunk unnamed-chunk-34


zz = Krig(xs, yres, rho = rho, theta = theta, m = 1, sigma2 = sigma)
yp = predict(bestmodel, type = "response") + predict.Krig(zz)

ii. Look at residuals of model

par(mfrow = c(2, 2))

error = precip - yp

hist(error, probability = T)
# fit and plot a normal distribution to the residuals..
lines(error[order(error)], dnorm(error[order(error)], mean = mean(error), sd = sd(error)), 
    col = "red")

## Warning: sd(<matrix>) is deprecated.  Use apply(*, 2, sd) instead.

# nonparametric PDF
sm.density(error, add = T, col = "blue")

# Homoskedastic: residuals versus y_hat
plot(yp, error)
abline(h = 0, col = "red")
title("Homoskedasticity")

# Check normality of residuals
qqnorm(error, ylab = "Standardized Residuals", xlab = "Normal Scores", main = "Normal QQPlot of Residuals")
qqline(error, col = "red")

# Autocorrelation test
acf(error)

plot of chunk unnamed-chunk-35

iii. Cross Validation: drop 1 point, fit model, predict that point

# get the cross validated estimates..
X = as.matrix(long)
X2 = as.matrix(cbind(lat, long))
X3 = as.matrix(cbind(lat, long, elevation))
Y = precip
N = length(Y)
yest = rep(c(1:N))
index = 1:N

for (i in 1:N) {
    index1 = index[-i]
    Xval = X[index1, ]
    Yval = Y[index1]
    XVal2 = X2[index1, ]
    Xval3 = X3[index1, ]

    ### fit GLM and get residuals..
    zglm = glm(Yval ~ ., data = as.data.frame(Xval), family = Gamma(link = "identity"))
    yres = zglm$resid

    zz = Krig(XVal2, yres, rho = rho, theta = theta, m = 1, sigma2 = sigma)

    Xval = X[i]
    yest[i] = predict(zglm, newdata = as.data.frame(Xval), type = "response") + 
        predict.Krig(zz, t(X2[i, ]))
}

par(mfrow = c(1, 1))
plot(Y, yest, xlim = range(Y, yest), ylim = range(Y, yest), xlab = "True value", 
    ylab = "x-validated estimate", main = "Drop 1 Predictions")
lines(Y, Y)

plot of chunk unnamed-chunk-36

iv. Drop 10% of observations, fit the model (i.e., the 'best' model from i. above) to the rest of the data and predict the dropped points. Compute RMSE and R2 and show them as boxplots.

### Drop 10% of points, fit the model and predict the dropped points..
N = length(precip)
Np = round(0.1 * N)
nsim = 5
rmse = c()
rsq = c()
index = 1:N

X = as.matrix(long)
X2 = as.matrix(cbind(lat, long))
X3 = as.matrix(cbind(lat, long, elevation))

for (i in 1:nsim) {
    drop = round(runif(Np, 1, N))  # select random 10% of points
    keep = setdiff(index, drop)

    Xdrop = X[drop, ]
    Xdrop2 = X2[drop, ]
    Xdrop3 = X3[drop, ]

    Ykeep = precip[keep]
    Xkeep = X[keep, ]
    Xkeep2 = X2[keep, ]
    Xkeep3 = X3[keep, ]

    zglm = glm(Ykeep ~ ., data = as.data.frame(Xkeep), family = Gamma(link = "identity"))
    yres = zglm$resid

    zz = Krig(Xkeep2, yres, rho = rho, theta = theta, m = 1, sigma2 = sigma)

    Xkeep = Xdrop
    yp = predict(zglm, newdata = as.data.frame(Xkeep), type = "response") + 
        predict.Krig(zz, Xdrop2)

    resid = precip[drop] - yp
    rmse[i] = sqrt(mean((precip[drop] - yp)^2))
    SST = sum((precip[drop] - mean(precip[drop]))^2)
    SSR = sum((precip[drop] - yp)^2)
    rsq[i] = 1 - (SSR/SST)
}

par(mfrow = c(1, 2))

# boxplot rmse
zz = myboxplot(rmse, xlab = "", ylab = "Rmse", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rmse), col = "red", cex = 1.25, pch = 16)
title(main = "RMSE: predicted values\nof the 10% dropped")

# boxplot rsq
zz = myboxplot(rsq, xlab = "", ylab = "R-Squared", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rsq), col = "red", cex = 1.25, pch = 16)
title(main = "R-Squared: predicted values\nof the 10% dropped")

plot of chunk unnamed-chunk-37

v. Predict on DEM

lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]

xll = cbind(long, lat)
xe = cbind(long)

### fit GLM and get residuals..
zglm = glm(precip ~ ., data = as.data.frame(xe), family = Gamma(link = "identity"))
yres = zglm$res

zz = Krig(xll, yres, rho = rho, theta = theta, sigma2 = sigma, m = 1)
krigy1 = predict(zz, x = xll)

# now predict on dem values
predpts = read.table("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat")

lat = predpts$V1
lon = predpts$V2
elev = predpts$V3
xe = cbind(lon)
xll = cbind(lon, lat)

glm = predict(zglm, newdata = as.data.frame(xe), se.fit = TRUE, type = "response")

krigy = predict.Krig(zz, x = xll)
glmy = glm$fit

par(mfrow = c(2, 1))

yp = glmy + krigy
zmat = matrix(yp, nrow = ncs, ncol = nrs)
image.plot(long1, lat1, t(zmat), zlim = range(precip), main = "Estimated Precipitation")
contour(long1, lat1, t(zmat), add = T)

glmse = glm$se.fit
krigse = predict.se.Krig(zz, x = xll)

yp = glmse + krigse
zmat = matrix(yp, nrow = ncs, ncol = nrs)
image.plot(long1, lat1, t(zmat), zlim = range(precip), main = "Estimated Error")
contour(long1, lat1, t(zmat), add = T)

plot of chunk unnamed-chunk-38

lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]

xll = cbind(long, lat)
xe = cbind(long)

### fit GLM and get residuals..
zglm = glm(precip ~ ., data = as.data.frame(xe), family = Gamma(link = "identity"))
yres = zglm$res

zz = Krig(xll, yres, rho = rho, theta = theta, sigma2 = sigma, m = 1)

# now predict on dem values
predpts = read.table("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat")
lat = predpts$V1
lon = predpts$V2
elev = predpts$V3
xe = cbind(lon)
xll = cbind(lon, lat)

glmy = predict(zglm, newdata = as.data.frame(xe), se.fit = TRUE)

glmest = glm$fit
krigy = predict(zz, x = xll)

yp = glmest + krigy

krigse = predict.se(zz, x = xll)
glmse = glmy$se.fit

all.se = glmse + krigse

### spatial map of estimates and errors - modify to add beautify..
par(mfrow = c(2, 1))
xlon = sort(unique(lon))
nrs = length(xlon)
ylat = sort(unique(lat))
ncs = length(ylat)
# estimates
zmat = matrix(yp, nrow = ncs, ncol = nrs)
image.plot(xlon, ylat, t(zmat), zlim = range(precip))
contour(xlon, ylat, t(zmat), add = T)
# errors
zmat = matrix(all.se, nrow = ncs, ncol = nrs)
image.plot(xlon, ylat, t(zmat))
contour(xlon, ylat, t(zmat), add = T)

plot of chunk unnamed-chunk-39

Question 8

Daily Precipitation at the 491 locations in Colorado is available for three days January 11, 12 and 13, 1997. Select one of the days (try Jan 11 or 12), convert the daily rainfall at each station into a binary variable of no precipitation (0) and a nonzero precipitation amount (1).

# January 11
mydata1 = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_pcp_daily_1997_01_11.dat"), 
    ncol = 4, byrow = T)
lat = mydata1[, 1]
long = mydata1[, 2]
elevation = mydata1[, 3]
precip = mydata1[, 4]
X = cbind(lat, long, elevation, precip)

# remove all rows where precip=-9999
threshhold = -1
X = subset(X, X[, 4] > threshhold)

# Convert matrix to binary based on values in column 4 (precip)
X[, 4][X[, 4] > 0] <- 1  #in column 4, if the value is greater than 0 in, replace with 1
X[, 4][X[, 4] < 1] <- 0  #in column 4, if the value is less than 1 in, replace with 0
# this is the new matirix
precip = X[, 4]

a) Fit a 'best' GLM with the appropriate link function using one of the objective functions.

# using AIC

X = as.data.frame(X[, 1:3])
zz = glm(precip ~ ., data = X, family = binomial(link = "probit"))
bestmodel = stepAIC(zz)

## Start:  AIC=343.3
## precip ~ lat + long + elevation
## 
##             Df Deviance AIC
## <none>              335 343
## - long       1      340 346
## - elevation  1      347 353
## - lat        1      356 362


X = as.data.frame(X[, 1:3])
zz = glm(precip ~ ., data = X, family = binomial(link = "cloglog"))
bestmodel = stepAIC(zz)

## Start:  AIC=344.5
## precip ~ lat + long + elevation
## 
##             Df Deviance AIC
## <none>              336 344
## - long       1      339 345
## - elevation  1      348 354
## - lat        1      357 363


X = as.data.frame(X[, 1:3])
zz = glm(precip ~ ., data = X, family = binomial(link = "logit"))
bestmodel = stepAIC(zz)

## Start:  AIC=342.9
## precip ~ lat + long + elevation
## 
##             Df Deviance AIC
## <none>              335 343
## - long       1      340 346
## - elevation  1      347 353
## - lat        1      356 362


# AIC shows that 'logit' has the lowest AIC. AIC also shows that all
# varibles are good predictors

b) Estimate the function on the DEM grade and plot the surface.Compare them with the surface plot of the elevation and also with the results from Slater and Clark (2006) Figure 4. They use a quasi-local GLM approach but I would imagine the results should be similar.

predpoints = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat"), 
    ncol = 3, byrow = T)
X = as.data.frame(predpoints)
names(X) = c("lat", "long", "elevation")

lat1 = sort(unique(X[, 1]))
long1 = sort(unique(X[, 2]))
ypred = predict.glm(bestmodel, newdata = X, type = "response", se.fit = TRUE)

par(mfrow = c(1, 1))

ypred.mat = matrix(ypred$fit, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat))
contour(long1, lat1, t(ypred.mat), add = T)

plot of chunk unnamed-chunk-42

Question 9

Repeat 8 using local glm.

# January 11
mydata1 = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_pcp_daily_1997_01_11.dat"), 
    ncol = 4, byrow = T)
lat = mydata1[, 1]
long = mydata1[, 2]
elevation = mydata1[, 3]
precip = mydata1[, 4]
X = cbind(lat, long, elevation, precip)

# remove all rows where precip=-9999
threshhold = -1
X = subset(X, X[, 4] > threshhold)

# Convert matrix to binary based on values in column 4 (precip)
X[, 4][X[, 4] > 0] <- 1  #in column 4, if the value is greater than 0 in, replace with 1
X[, 4][X[, 4] < 1] <- 0  #in column 4, if the value is less than 1 in, replace with 0
# this is the new matirix
precip = X[, 4]
lat = X[, 1]
long = X[, 2]
elevation = X[, 3]
X = cbind(lat, long, elevation)

a) Fit a 'best' local GLM with the appropriate link function using one of the objective functions.

mystud1 = thebest(X, precip, 1, "binomial")
mystud2 = thebest(X, precip, 2, "binomial")

mystud1$gcv

## [1] 0.1225

mystud2$gcv

## [1] 0.01567


# degree 2 is best
bestalpha = mystud2$bestalpha

# Find best predictors
combs = leaps(X, precip)
combos = combs$which
ncombos = length(combos[, 1])
z1 = 1:ncombos

for (i in 1:ncombos) {
    zz = gcvplot(precip ~ X[, combos[i, ]], alpha = bestalpha, deg = 2, kern = "bisq", 
        ev = dat(), scale = TRUE, family = "binomial")
    z1[i] = as.vector(zz$values)
}
z2 = order(z1)[1]
bestpredictors = combs$which[z2, ]
bestpredictors

##    1    2    3 
## TRUE TRUE TRUE


# Now find best link.
z3 = gcvplot(precip ~ X, alpha = bestalpha, deg = 2, kern = "bisq", ev = dat(), 
    scale = TRUE, family = "binomial", link = "logit")
z5 = gcvplot(precip ~ X, alpha = bestalpha, deg = 2, kern = "bisq", ev = dat(), 
    scale = TRUE, family = "binomial", link = "ident")

# the lowest gcv for each link
sort(z3$values)[1]

## [1] 0.01567

sort(z5$values)[1]

## [1] -0.9508

# use link='logit'

bestmodel = locfit(precip ~ lat + long + elevation, alpha = bestalpha, deg = 2, 
    kern = "bisq", scale = TRUE, family = "binomial", link = "logit")


predpoints = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat"), 
    ncol = 3, byrow = T)
X = as.data.frame(predpoints)
names(X) = c("lat", "long", "elevation")

ypred = predict(bestmodel, newdata = as.data.frame(X))

par(mfrow = c(1, 1))

lat1 = sort(unique(X[, 1]))
long1 = sort(unique(X[, 2]))

ypred.mat = matrix(ypred, nrow = length(lat1), ncol = length(long1))
image.plot(long1, lat1, t(ypred.mat))
contour(long1, lat1, t(ypred.mat), add = T)

plot of chunk unnamed-chunk-45

Question 10

Fit a hierarchal spatial model with Local Polynomial

i. Look at variogram and fit model

mydata = matrix(scan("http://cires.colorado.edu/~aslater/CVEN_6833/colo_precip.dat"), 
    ncol = 4, byrow = T)
lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]
X = cbind(lat, long, elevation)

# from question 5
bestalpha = 0.06629328
bestfamily = "gamma"
zloc = locfit(precip ~ lat + long + elevation, alpha = bestalpha, deg = 2, kern = "bisq", 
    scale = TRUE, family = bestfamily, link = "log")
yres = residuals(zloc, type = "response")

xs = cbind(long, lat)

par(mfrow = c(1, 1))

sigma = 1
look <- vgram(xs, yres, N = 15, lon.lat = FALSE)
bplot.xy(look$d, look$vgram, breaks = look$breaks, outline = FALSE, xlab = "distance (degrees)", 
    ylab = "variogram")
points(look$centers, look$stats[2, ], col = "blue")

xd <- look$centers
ym <- look$stats[2, ]

rho = 20000
theta = 0.1

xr = round(max(xd) + sd(xd))
dgrid <- seq(0, xr, , 400)
lines(dgrid, sigma^2 + rho * (1 - exp(-1 * dgrid/theta)), col = "blue", lwd = 3)

plot of chunk unnamed-chunk-46


zz = Krig(xs, yres, rho = rho, theta = theta, m = 1, sigma2 = sigma)
ypred = predict(zloc, X, se.fit = TRUE)
yp = ypred$fit + predict.Krig(zz)

ii. Look at residuals of model

par(mfrow = c(2, 2))

error = precip - yp

hist(error, probability = T)
# fit and plot a normal distribution to the residuals..
lines(error[order(error)], dnorm(error[order(error)], mean = mean(error), sd = sd(error)), 
    col = "red")

## Warning: sd(<matrix>) is deprecated.  Use apply(*, 2, sd) instead.

# nonparametric PDF
sm.density(error, add = T, col = "blue")

# Homoskedastic: residuals versus y_hat
plot(yp, error)
abline(h = 0, col = "red")
title("Homoskedasticity")

# Check normality of residuals
qqnorm(error, ylab = "Standardized Residuals", xlab = "Normal Scores", main = "Normal QQPlot of Residuals")
qqline(error, col = "red")

# Autocorrelation test
acf(error)

plot of chunk unnamed-chunk-47

iii. Cross Validation: drop 1 point, fit model, predict that point

# get the cross validated estimates..
X2 = as.matrix(cbind(lat, long))
X3 = as.matrix(cbind(lat, long, elevation))
N = length(precip)
yest = rep(c(1:N))
index = 1:N

for (i in 1:N) {
    index1 = index[-i]
    YVal = precip[index1]
    XVal2 = X2[index1, ]
    XVal3 = X3[index1, ]

    ### fit GLM and get residuals..
    zloc = locfit(YVal ~ XVal3, alpha = bestalpha, deg = 2, kern = "bisq", scale = TRUE, 
        family = bestfamily, link = "log")
    yres = residuals(zloc, type = "response")

    zz = Krig(XVal2, yres, rho = rho, theta = theta, m = 1, sigma2 = sigma)

    XVal = X[i]
    yest[i] = predict(zglm, newdata = as.data.frame(XVal)) + predict.Krig(zz, 
        t(X2[i, ]))
}

par(mfrow = c(1, 1))
plot(Y, yest, xlim = range(Y, yest), ylim = range(Y, yest), xlab = "True value", 
    ylab = "x-validated estimate", main = "Drop 1 Predictions")
lines(Y, Y)

plot of chunk unnamed-chunk-48

iv. Drop 10% of observations, fit the model (i.e., the 'best' model from i. above) to the rest of the data and predict the dropped points. Compute RMSE and R2 and show them as boxplots.

### Drop 10% of points, fit the model and predict the dropped points..
N = length(precip)
Np = round(0.1 * N)
nsim = 5
rmse = c()
rsq = c()
index = 1:N

X = as.matrix(long)
X2 = as.matrix(cbind(lat, long))
X3 = as.matrix(cbind(lat, long, elevation))

for (i in 1:nsim) {
    drop = round(runif(Np, 1, N))  # select random 10% of points
    keep = setdiff(index, drop)

    Xdrop = X[drop, ]
    Xdrop2 = X2[drop, ]
    Xdrop3 = X3[drop, ]

    Ykeep = precip[keep]
    Xkeep = X[keep, ]
    Xkeep2 = X2[keep, ]
    Xkeep3 = X3[keep, ]

    zloc = locfit(Ykeep ~ Xkeep3, alpha = bestalpha, deg = 2, kern = "bisq", 
        scale = TRUE, family = bestfamily, link = "log")
    yres = residuals(zloc, type = "response")

    zz = Krig(Xkeep2, yres, rho = rho, theta = theta, m = 1, sigma2 = sigma)

    Xkeep3 = Xdrop3
    yp = predict(zloc, newdata = as.data.frame(Xkeep3)) + predict.Krig(zz, Xdrop2)

    resid = precip[drop] - yp
    rmse[i] = sqrt(mean((precip[drop] - yp)^2))
    SST = sum((precip[drop] - mean(precip[drop]))^2)
    SSR = sum((precip[drop] - yp)^2)
    rsq[i] = 1 - (SSR/SST)
}

par(mfrow = c(1, 2))

# boxplot rmse
zz = myboxplot(rmse, xlab = "", ylab = "Rmse", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rmse), col = "red", cex = 1.25, pch = 16)
title(main = "RMSE: predicted values\nof the 10% dropped")

# boxplot rsq
zz = myboxplot(rsq, xlab = "", ylab = "R-Squared", plot = F)
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, mean(rsq), col = "red", cex = 1.25, pch = 16)
title(main = "R-Squared: predicted values\nof the 10% dropped")

plot of chunk unnamed-chunk-49

v. Predict on DEM

lat = mydata[, 1]
long = mydata[, 2]
elevation = mydata[, 3]
precip = mydata[, 4]

X = cbind(lat, long, elevation)
X2 = cbind(long, lat)

### fit locfit and get residuals..
zloc = locfit(precip ~ X, alpha = bestalpha, deg = 2, kern = "bisq", scale = TRUE, 
    family = bestfamily, link = "log")
yres = residuals(zloc, type = "response")

zz = Krig(X2, yres, rho = rho, theta = theta, sigma2 = sigma, m = 1)

# now predict on dem values
predpts = read.table("http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat")
lat = predpts$V1
lon = predpts$V2
elev = predpts$V3
X = cbind(lat, lon, elev)
X2 = cbind(lon, lat)

loc = predict(zloc, newdata = as.data.frame(X), se.fit = TRUE)

locfit = loc$fit
krigy = predict(zz, x = X2)

yp = locfit + krigy

locse = loc$se.fit
krigse = predict.se(zz, x = xll)

all.se = locse + krigse

### spatial map of estimates and errors - modify to add beautify..
par(mfrow = c(2, 1))
xlon = sort(unique(lon))
nrs = length(xlon)
ylat = sort(unique(lat))
ncs = length(ylat)
# estimates
zmat = matrix(yp, nrow = ncs, ncol = nrs)
image.plot(xlon, ylat, t(zmat), zlim = range(precip))
contour(xlon, ylat, t(zmat), add = T)
# errors
zmat = matrix(all.se, nrow = ncs, ncol = nrs)
image.plot(xlon, ylat, t(zmat))
contour(xlon, ylat, t(zmat), add = T)

plot of chunk unnamed-chunk-50

Last Question. Summarize all the techniques.

Models Explored:

Linear Model (Normal)
- Global fit
- Assumes precip is normal
- Major correlation on errors- indicating errors have a spatial relationship
- Further errors are not normal (bad looking QQ plot) and are heteroskedastic.
- Longitude is best predictor
Generalized Linear Model
- Also a global fit
- Precip is of a gamma distribution instead of normal
- Still major correlation on errors- indicating errors have a spatial relationship
- Further errors are not normal (bad looking QQ plot) and are heteroskedastic.
- Longitude is best predictor
- Not much different to linear model
Local Polynomial
- Local fit
- Assumes precip is of a guassian distribution
- Improvements on QQ plot and homoskedasticity of errors
- Overall, Significantly better than lm and glm
- Still some remaining correlation on errors- indicating errors may have a spatial relationship not captured by this model
- Longitude and Latitude are best predictors
Local Polynomial with distribution defined
- Local fit
- Assume precip is of a gamma distribution
- Not too much different from the local polynomial model with guassian distribtion- perhaps this model is slightly better
- Again, still some correlation on errors indicating errors have a spatial relationship not captured by this model
- Longitude and latitude still best predictors
Co-Kriging: Spatial Model on dependent variable
- Global fit that emphasises high correlation for short distances and low correlation for long distances
- aka Kriging
- Error diagnositics are significantly better than lm, glm, local polynomial and local polynomial (gamma)
- Autocorrelation is low
- QQ Plot and Homoskedasticity look good
- Longitude, latitude and elevation all best predictors
Hierarchial Kriging
- 1st model: Global fit and/or Local fit on dependent variable
- 2nd model: Kriging on residuals (order=1)
- This accounts for the autocorrelated residuals from the first model by applying a spatial model on them
- Error diagnositics are significantly better than lm, glm, local polynomial, local polynomial (gamma) and regular spatial model
- Autocorrelation is low
- QQ Plot and Homoskedasticity look good
- Longitude, latitude and elevation all best predictors
- Should yeild best/ most accurate plots