Read in Colorado River flow data from Lees Ferry, AZ.
rm(list = ls())
setwd("~/Documents/University of Colorado/Fall 2013/CVEN6833/HW3")
library(tree)
source("bestTREE.r")
# read in the four datasets
LFann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/LF_1906-2005.txt")
ENSOann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/nino3_1905-2007.txt")
PDOann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/PDO_1900-2011.txt")
AMOann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/AMO_1856-2011.txt")
# select the years that match for the period of 1906 to 2005
years <- seq(1906, 2005, 1)
LFdata <- as.data.frame(cbind(LFann[, 1], LFann[, 2]/(10^6), ENSOann[match(years,
ENSOann[, 1]), 2], PDOann[match(years, PDOann[, 1]), 2], AMOann[match(years,
AMOann[, 1]), 2]))
names(LFdata) <- c("year", "LF", "ENSO", "PDO", "AMO")
Fit a best regression tree for the entire period - using CV and Deviance and display the tree.
# wrote my own function to look at the deviance across using different
# predictors and choose the least deviance
data <- as.matrix(LFdata[, 2:5])
ztree <- bestTREE(data)
ztree
## deviances models
## [1,] "1026.3212702031" "full"
## [2,] "1340.38785429764" "2,3"
## [3,] "1132.76454168481" "3,4"
## [4,] "1080.37609881298" "2,4"
## [5,] "1387.1050405729" "2"
## [6,] "1267.77028543124" "3"
## [7,] "1374.40693166927" "4"
# from the function we see that all three provide the lowest deviance so we
# will continue with that and prune this tree
ztree <- tree(LF ~ AMO + PDO + ENSO, LFdata)
plot(ztree)
text(ztree, cex = 0.5, digits = 3)
# Can also try prune.tree (default method='deviance')
tree.p <- prune.tree(ztree)
plot(tree.p)
tree.p$dev
## [1] 1026 1048 1106 1168 1202 1255 1315 1382 1535 1668 1940
opt.trees <- which(tree.p$dev == min(tree.p$dev))
min(tree.p$size[opt.trees])
## [1] 14
tree.p <- prune.tree(ztree, best = 14)
plot(tree.p)
text(tree.p, cex = 0.5, digits = 3)
We can also look at CV
# Can also try cv.tree
tree.cv <- cv.tree(ztree)
plot(tree.cv)
Evaluate the performance of the regression tree on three different epochs: 1906-1921, 1975-1990, and 1990-2005. Fit a best tree on the data outside these epochs and predict the mean flow for these epochs. Plot the observed and predicted values. Compute R2 and RMSE.
# Find the indices for each epoch
i1 <- match(seq(1906, 1921, 1), LFdata[, 1])
i2 <- match(seq(1975, 1990, 1), LFdata[, 1])
i3 <- match(seq(1990, 2005, 1), LFdata[, 1])
is <- cbind(i1, i2, i3)
# Drop one epoch, fit tree, predict dropped, compute RMSE and R2
RMSE <- 1:3
RSQ <- 1:3
par(mfrow = c(3, 1))
for (i in 1:3) {
drop <- is[, i]
xdata <- LFdata[-drop, ]
fit <- tree(LF ~ ENSO + PDO + AMO, xdata)
pred <- predict(fit, newdata = LFdata[drop, ])
plot(LFdata$LF[drop], pred, xlim = c(min(LFdata$LF[drop]), max(LFdata$LF[drop])),
ylim = c(min(LFdata$LF[drop]), max(LFdata$LF[drop])))
lines(LFdata$LF[drop], LFdata$LF[drop], col = "red")
# RMSE[i] <- cor(LFdata[drop,2],pred) RSQ[i] <- RMSE[i]^2
n <- length(pred)
RMSE[i] <- sum((LFdata[drop, 2] - pred)^2)/n
RSQ[i] <- cor(LFdata[drop, 2], pred)^2
}
RMSE
## [1] 23.00 29.21 27.88
RSQ
## [1] 0.009195 0.155395 0.017276