Emily Gill - CVEN6833 - Homework #3

December 16, 2013

Question 1: CART & Forecasting

Read in Colorado River flow data from Lees Ferry, AZ.

rm(list = ls())
setwd("~/Documents/University of Colorado/Fall 2013/CVEN6833/HW3")

library(tree)
source("bestTREE.r")

# read in the four datasets
LFann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/LF_1906-2005.txt")
ENSOann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/nino3_1905-2007.txt")
PDOann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/PDO_1900-2011.txt")
AMOann <- read.table("http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/AMO_1856-2011.txt")

# select the years that match for the period of 1906 to 2005
years <- seq(1906, 2005, 1)
LFdata <- as.data.frame(cbind(LFann[, 1], LFann[, 2]/(10^6), ENSOann[match(years, 
    ENSOann[, 1]), 2], PDOann[match(years, PDOann[, 1]), 2], AMOann[match(years, 
    AMOann[, 1]), 2]))
names(LFdata) <- c("year", "LF", "ENSO", "PDO", "AMO")

Part (i)

Fit a best regression tree for the entire period - using CV and Deviance and display the tree.

# wrote my own function to look at the deviance across using different
# predictors and choose the least deviance
data <- as.matrix(LFdata[, 2:5])
ztree <- bestTREE(data)
ztree

##      deviances          models
## [1,] "1026.3212702031"  "full"
## [2,] "1340.38785429764" "2,3" 
## [3,] "1132.76454168481" "3,4" 
## [4,] "1080.37609881298" "2,4" 
## [5,] "1387.1050405729"  "2"   
## [6,] "1267.77028543124" "3"   
## [7,] "1374.40693166927" "4"


# from the function we see that all three provide the lowest deviance so we
# will continue with that and prune this tree
ztree <- tree(LF ~ AMO + PDO + ENSO, LFdata)
plot(ztree)
text(ztree, cex = 0.5, digits = 3)

plot of chunk unnamed-chunk-2


# Can also try prune.tree (default method='deviance')
tree.p <- prune.tree(ztree)
plot(tree.p)

plot of chunk unnamed-chunk-2

tree.p$dev

##  [1] 1026 1048 1106 1168 1202 1255 1315 1382 1535 1668 1940

opt.trees <- which(tree.p$dev == min(tree.p$dev))
min(tree.p$size[opt.trees])

## [1] 14

tree.p <- prune.tree(ztree, best = 14)
plot(tree.p)
text(tree.p, cex = 0.5, digits = 3)

plot of chunk unnamed-chunk-2

We can also look at CV

# Can also try cv.tree
tree.cv <- cv.tree(ztree)
plot(tree.cv)

plot of chunk unnamed-chunk-3

Part (ii)

Evaluate the performance of the regression tree on three different epochs: 1906-1921, 1975-1990, and 1990-2005. Fit a best tree on the data outside these epochs and predict the mean flow for these epochs. Plot the observed and predicted values. Compute R2 and RMSE.

# Find the indices for each epoch
i1 <- match(seq(1906, 1921, 1), LFdata[, 1])
i2 <- match(seq(1975, 1990, 1), LFdata[, 1])
i3 <- match(seq(1990, 2005, 1), LFdata[, 1])
is <- cbind(i1, i2, i3)

# Drop one epoch, fit tree, predict dropped, compute RMSE and R2
RMSE <- 1:3
RSQ <- 1:3
par(mfrow = c(3, 1))
for (i in 1:3) {
    drop <- is[, i]
    xdata <- LFdata[-drop, ]
    fit <- tree(LF ~ ENSO + PDO + AMO, xdata)
    pred <- predict(fit, newdata = LFdata[drop, ])
    plot(LFdata$LF[drop], pred, xlim = c(min(LFdata$LF[drop]), max(LFdata$LF[drop])), 
        ylim = c(min(LFdata$LF[drop]), max(LFdata$LF[drop])))
    lines(LFdata$LF[drop], LFdata$LF[drop], col = "red")
    # RMSE[i] <- cor(LFdata[drop,2],pred) RSQ[i] <- RMSE[i]^2

    n <- length(pred)
    RMSE[i] <- sum((LFdata[drop, 2] - pred)^2)/n
    RSQ[i] <- cor(LFdata[drop, 2], pred)^2

}

plot of chunk unnamed-chunk-4


RMSE

## [1] 23.00 29.21 27.88

RSQ

## [1] 0.009195 0.155395 0.017276