# HW 3 Q 1 JLCY, 19 Dec 2013
############################################################
# part i
# use CART finding best regression tree for Lees Ferry with AMO, ENSO & PDO
# read Less Ferry data online
# LFdata=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/LeesFerry-monflows-1906-2006.txt'),
# ncol=13, byrow=T)
# read from pc
LFdata = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\R session 3\\4HW3\\LeesFerry-monflows-1906-2006.txt"),
ncol = 13, byrow = T)
# read AMO data online
# AMOdata=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/AMO_1856-2011.txt'),
# ncol=2, byrow=T)
# read from pc
AMOdata = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\R session 3\\4HW3\\AMO_1856-2011.txt"),
ncol = 2, byrow = T)
# read ENSO data online
# ENSOdata=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/nino3_1905-2007.txt'),
# ncol=2, byrow=T)
# read from pc
ENSOdata = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\R session 3\\4HW3\\nino3_1905-2007.txt"),
ncol = 2, byrow = T)
# read PDO data online
# PDOdata=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/PDO_1900-2011.txt'),
# ncol=2, byrow=T)
# read from pc
PDOdata = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\R session 3\\4HW3\\PDO_1900-2011.txt"),
ncol = 2, byrow = T)
############################################################
# extract data from 1906 to 2006
ntime = 101 # 1906-2006
# LF value - annual sum
LFvalue = LFdata[, 2:13] # all month data
LFvalue = apply(LFvalue, 1, sum) # row sum
# Divide by 10^6 to get them into Million Acre Feet (MaF)
LFvalue = LFvalue/1e+06
# AMO value
i = 1
for (i in 1:length(AMOdata[, 1])) {
if (AMOdata[i, 1] == 1906) {
# starts at 1906
AMOvalue = AMOdata[i:(ntime + i - 1), 2] # ends at 2006
}
}
# ENSO value
i = 1
for (i in 1:length(ENSOdata[, 1])) {
if (ENSOdata[i, 1] == 1906) {
ENSOvalue = ENSOdata[i:(ntime + i - 1), 2]
}
}
# PDO value
i = 1
for (i in 1:length(PDOdata[, 1])) {
if (PDOdata[i, 1] == 1906) {
PDOvalue = PDOdata[i:(ntime + i - 1), 2]
}
}
############################################################
# fitting using package 'rpart'
library(rpart)
fit <- rpart(LFvalue ~ AMOvalue + ENSOvalue + PDOvalue, method = "anova", data = data.frame(LFvalue))
printcp(fit) # display the results
##
## Regression tree:
## rpart(formula = LFvalue ~ AMOvalue + ENSOvalue + PDOvalue, data = data.frame(LFvalue),
## method = "anova")
##
## Variables actually used in tree construction:
## [1] AMOvalue ENSOvalue
##
## Root node error: 1947/101 = 19
##
## n= 101
##
## CP nsplit rel error xerror xstd
## 1 0.142 0 1.00 1.03 0.12
## 2 0.069 1 0.86 0.94 0.11
## 3 0.039 2 0.79 0.99 0.12
## 4 0.028 4 0.71 1.00 0.12
## 5 0.027 5 0.68 1.02 0.13
## 6 0.015 6 0.66 1.03 0.13
## 7 0.010 7 0.64 1.06 0.13
plotcp(fit) # visualize cross-validation results
par(ask = TRUE)
# summary(fit) # detailed summary of splits
# create additional plots
par(mfrow = c(1, 2)) # two plots on one page
rsq.rpart(fit) # visualize cross-validation results \t
##
## Regression tree:
## rpart(formula = LFvalue ~ AMOvalue + ENSOvalue + PDOvalue, data = data.frame(LFvalue),
## method = "anova")
##
## Variables actually used in tree construction:
## [1] AMOvalue ENSOvalue
##
## Root node error: 1947/101 = 19
##
## n= 101
##
## CP nsplit rel error xerror xstd
## 1 0.142 0 1.00 1.03 0.12
## 2 0.069 1 0.86 0.94 0.11
## 3 0.039 2 0.79 0.99 0.12
## 4 0.028 4 0.71 1.00 0.12
## 5 0.027 5 0.68 1.02 0.13
## 6 0.015 6 0.66 1.03 0.13
## 7 0.010 7 0.64 1.06 0.13
par(ask = TRUE)
par(mfrow = c(1, 1)) # 1 plot on one page
# plot tree
plot(fit, uniform = TRUE, main = "Regression Tree for LF flow using 'rpart'")
text(fit, use.n = TRUE, all = TRUE, cex = 0.9)
par(ask = TRUE)
############################################################
# find cp value that mimiizes CV error
min_cp = fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]
# prune the tree
pfit = prune(fit, cp = min_cp) # from cptable
# plot the pruned tree
plot(pfit, uniform = TRUE, main = "Pruned Regression Tree for LF flow")
text(pfit, use.n = TRUE, all = TRUE, cex = 0.8)
par(ask = TRUE)
############################################################
# fitting using package 'tree'
library(tree)
## Warning: package 'tree' was built under R version 3.0.2
# function fit_tree input different indices to find total residual deviance
# fitting input: combinations of AMOvalue, ENSOvalue, PDOvalue
fit_tree = function(LFvalue, dframe) {
dframe = data.frame(dframe)
# fit tree
fit_tree = tree(LFvalue ~ ., data = dframe)
# total residual deviance
trdev = sum(sapply(resid(fit_tree), function(x) (x - mean(resid(fit_tree)))^2))
# return
result = list(fit_tree = fit_tree, trdev = trdev)
return(result)
}
############################################################
# input different combinations of indices
# only AMO
dframe = cbind(AMOvalue)
fit_tree1 = fit_tree(LFvalue, dframe)
############################################################
# only ENSO
dframe = cbind(ENSOvalue)
fit_tree2 = fit_tree(LFvalue, dframe)
############################################################
# only PDO
dframe = cbind(PDOvalue)
fit_tree3 = fit_tree(LFvalue, dframe)
############################################################
# only AMO & ENSO
dframe = cbind(AMOvalue, ENSOvalue)
fit_tree4 = fit_tree(LFvalue, dframe)
############################################################
# only AMO & PDO
dframe = cbind(AMOvalue, PDOvalue)
fit_tree5 = fit_tree(LFvalue, dframe)
############################################################
# only ENSO & PDO
dframe = cbind(ENSOvalue, PDOvalue)
fit_tree6 = fit_tree(LFvalue, dframe)
############################################################
# All 3: AMO ENSO & PDO
dframe = cbind(AMOvalue, ENSOvalue, PDOvalue)
fit_tree7 = fit_tree(LFvalue, dframe)
############################################################
# find tree with least deviance and set to best_fit_tree
# group deviances
list_tree_dev = cbind(fit_tree1$trdev, fit_tree2$trdev, fit_tree3$trdev, fit_tree4$trdev,
fit_tree5$trdev, fit_tree6$trdev, fit_tree7$trdev)
min_list_tree_dev = min(list_tree_dev) # min of list
where_min_list_tree_dev = match(min_list_tree_dev, list_tree_dev) # position of min dev
# find out best_fit_tree with lowest deviance
if (where_min_list_tree_dev == 1) best_fit_tree = fit_tree1$fit_tree
if (where_min_list_tree_dev == 2) best_fit_tree = fit_tree2$fit_tree
if (where_min_list_tree_dev == 3) best_fit_tree = fit_tree3$fit_tree
if (where_min_list_tree_dev == 4) best_fit_tree = fit_tree4$fit_tree
if (where_min_list_tree_dev == 5) best_fit_tree = fit_tree5$fit_tree
if (where_min_list_tree_dev == 6) best_fit_tree = fit_tree6$fit_tree
if (where_min_list_tree_dev == 7) best_fit_tree = fit_tree7$fit_tree
############################################################
# plot tree
plot(best_fit_tree)
text(best_fit_tree, cex = 0.8)
par(ask = TRUE)
############################################################
# pruning - evaluating error on training data
# Sequence of pruned tree sizes/errors
best_fit_tree.seq = prune.tree(best_fit_tree)
# Plots size vs. error
plot(best_fit_tree.seq)
# Vector of error rates for prunings, in order
best_fit_tree.seq$dev
## [1] 959.0 980.3 1038.9 1101.2 1134.8 1187.5 1247.3 1384.2 1537.3 1670.8
## [11] 1947.0
par(ask = TRUE)
# Positions of optimal (with respect to error) trees
opt.trees = which(best_fit_tree.seq$dev == min(best_fit_tree.seq$dev))
# Size of smallest optimal tree
optimal_size = min(best_fit_tree.seq$size[opt.trees]) # = 15
cat("Size of smallest optimal tree = ", optimal_size, "\n", "\n")
## Size of smallest optimal tree = 15
##
############################################################
# CV
# fit tree
best_fit_tree = tree(LFvalue ~ AMOvalue + ENSOvalue + PDOvalue, data = data.frame(LFvalue))
best_fit_tree_cv = cv.tree(best_fit_tree)
plot(best_fit_tree_cv)
par(ask = TRUE)
############################################################
# part ii
# separate different data subsets
# 1906 - 1921: predict
LFvalue_06_21 = LFvalue[1:16]
AMOvalue_06_21 = AMOvalue[1:16]
ENSOvalue_06_21 = ENSOvalue[1:16]
PDOvalue_06_21 = PDOvalue[1:16]
# 1922 - 2006: fit
LFvalue_22_06 = LFvalue[17:101]
AMOvalue_22_06 = AMOvalue[17:101]
ENSOvalue_22_06 = ENSOvalue[17:101]
PDOvalue_22_06 = PDOvalue[17:101]
############################################################
# 1975 - 1990: predict
LFvalue_75_90 = LFvalue[70:85]
AMOvalue_75_90 = AMOvalue[70:85]
ENSOvalue_75_90 = ENSOvalue[70:85]
PDOvalue_75_90 = PDOvalue[70:85]
# 1906 - 1974 & 1991 - 2006: fit
LFvalue_06_74_91_06 = append(LFvalue[1:69], LFvalue[86:101])
AMOvalue_06_74_91_06 = append(AMOvalue[1:69], AMOvalue[86:101])
ENSOvalue_06_74_91_06 = append(ENSOvalue[1:69], ENSOvalue[86:101])
PDOvalue_06_74_91_06 = append(PDOvalue[1:69], PDOvalue[86:101])
############################################################
# 1990 - 2006: predict
LFvalue_90_06 = LFvalue[85:101]
AMOvalue_90_06 = AMOvalue[85:101]
ENSOvalue_90_06 = ENSOvalue[85:101]
PDOvalue_90_06 = PDOvalue[85:101]
# 1906 - 1989: fit
LFvalue_06_89 = LFvalue[1:84]
AMOvalue_06_89 = AMOvalue[1:84]
ENSOvalue_06_89 = ENSOvalue[1:84]
PDOvalue_06_89 = PDOvalue[1:84]
############################################################
# function best_fit_tree_epoch predict LF flow of epoch from regression tree
# of outside epoch fitting input: LF, AMO, ENSO, PDO predicting input: LF,
# AMO, ENSO, PDO
best_fit_tree_epoch = function(LF_fit_input, AMO_fit_input, ENSO_fit_input,
PDO_fit_input, LF_pred_input, AMO_pred_input, ENSO_pred_input, PDO_pred_input) {
# fitting input AMO, ENSO & PDO
AMO = AMO_fit_input
ENSO = ENSO_fit_input
PDO = PDO_fit_input
climate_index = cbind(AMO, ENSO, PDO)
climate_index = as.data.frame(climate_index)
# fit tree
best_fit_tree = tree(LF_fit_input ~ ., climate_index)
# plot tree
plot(best_fit_tree)
text(best_fit_tree, cex = 0.8)
par(ask = TRUE)
# predicting input AMO, ENSO & PDO
AMO = AMO_pred_input
ENSO = ENSO_pred_input
PDO = PDO_pred_input
climate_index = cbind(AMO, ENSO, PDO)
climate_index = as.data.frame(climate_index)
# predict
best_fit_tree_pred = predict(best_fit_tree, climate_index, class = "tree")
# R^2 for prediction
R2 = (cor(LF_pred_input, best_fit_tree_pred))^2
# RMSE for prediction
N = length(LF_pred_input)
RMSE = sqrt((sum((best_fit_tree_pred - LF_pred_input)^2))/N)
# return prediction, R^2 & RMSE
result = list(best_fit_tree_pred = best_fit_tree_pred, R2 = R2, RMSE = RMSE)
return(result)
}
############################################################
# predict LF flow of 1906-1921 from regression tree of 1922-2006
y = best_fit_tree_epoch(LF_fit_input = LFvalue_22_06, AMO_fit_input = AMOvalue_22_06,
ENSO_fit_input = ENSOvalue_22_06, PDO_fit_input = PDOvalue_22_06, LF_pred_input = LFvalue_06_21,
AMO_pred_input = AMOvalue_06_21, ENSO_pred_input = ENSOvalue_06_21, PDO_pred_input = PDOvalue_06_21)
# plot observed & predicted values with 1:1 line; print R^2 & RMSE
plot(LFvalue_06_21, y$best_fit_tree_pred, main = "Observed vs Predicted LF flow 1906-1921")
abline(0, 1)
par(ask = TRUE)
cat("R^2 for predicting 1906-1921 = ", y$R2, "\n", "\n")
## R^2 for predicting 1906-1921 = 0.04497
##
cat("RMSE for predicting 1906-1921 = ", y$RMSE, "\n", "\n")
## RMSE for predicting 1906-1921 = 5.241
##
############################################################
# predict LF flow of 1975-1990 from regression tree of 1906-1994 & 1991-2006
y = best_fit_tree_epoch(LF_fit_input = LFvalue_06_74_91_06, AMO_fit_input = AMOvalue_06_74_91_06,
ENSO_fit_input = ENSOvalue_06_74_91_06, PDO_fit_input = PDOvalue_06_74_91_06,
LF_pred_input = LFvalue_75_90, AMO_pred_input = AMOvalue_75_90, ENSO_pred_input = ENSOvalue_75_90,
PDO_pred_input = PDOvalue_75_90)
# plot observed & predicted values with 1:1 line; print R^2 & RMSE
plot(LFvalue_75_90, y$best_fit_tree_pred, main = "Observed vs Predicted LF flow 1975-1990")
abline(0, 1)
par(ask = TRUE)
cat("R^2 for predicting 1975-1990 = ", y$R2, "\n", "\n")
## R^2 for predicting 1975-1990 = 0.1553
##
cat("RMSE for predicting 1975-1990 = ", y$RMSE, "\n", "\n")
## RMSE for predicting 1975-1990 = 5.408
##
############################################################
# predict LF flow of 1906-1921 from regression tree of 1922-2006
y = best_fit_tree_epoch(LF_fit_input = LFvalue_06_89, AMO_fit_input = AMOvalue_06_89,
ENSO_fit_input = ENSOvalue_06_89, PDO_fit_input = PDOvalue_06_89, LF_pred_input = LFvalue_90_06,
AMO_pred_input = AMOvalue_90_06, ENSO_pred_input = ENSOvalue_90_06, PDO_pred_input = PDOvalue_90_06)
# plot observed & predicted values with 1:1 line; print R^2 & RMSE
plot(LFvalue_90_06, y$best_fit_tree_pred, main = "Observed vs Predicted LF flow 1990-2006")
abline(0, 1)
par(ask = TRUE)
cat("R^2 for predicting 1990-2006 = ", y$R2, "\n", "\n")
## R^2 for predicting 1990-2006 = 0.01583
##
cat("RMSE for predicting 1990-2006 = ", y$RMSE, "\n", "\n")
## RMSE for predicting 1990-2006 = 5.234
##
############################################################
# Bonus
# fit tree to CO precip, predict on DEM grid
# data point in data set = 491
N = 491
# read data online dat =
# read.table('http://cires.colorado.edu/~aslater/CVEN_6833/colo_precip.dat',
# nrows = N)
# read data from pc
dat <- read.table(file = "C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 1\\colo_precip.dat",
nrows = N)
# load independent variable X and depedent variable Y
X = dat[, 1:3]
Y = dat[, 4]
# Name Lat, Lon, Alt as independent variables
Lat = X[, 1]
Lon = X[, 2]
Alt = X[, 3]
# Name Ppt as dependent variable
Ppt = Y
############################################################
# fit tree
lat_lon_alt = cbind(Lat, Lon, Alt)
lat_lon_alt = as.data.frame(lat_lon_alt)
# fit tree
best_fit_tree_ppt = tree(Ppt ~ ., lat_lon_alt)
# plot tree
plot(best_fit_tree_ppt)
text(best_fit_tree_ppt, cex = 0.8)
par(ask = TRUE)
############################################################
# load DEM grid
# 1705 points
N = 1705
# read in grid dat =
# read.table('http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat',
# nrows = N)
# read data from pc
dat <- read.table(file = "C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 1\\colo_dem.dat",
nrows = N)
# load independent variable X
X = dat[, 1:3]
# Name Lat, Lon, Alt as independent variables
Lat = X[, 1]
Lon = X[, 2]
Alt = X[, 3]
# predict with new data
lat_lon_alt = cbind(Lat, Lon, Alt)
lat_lon_alt = as.data.frame(lat_lon_alt)
# predict
best_fit_tree_ppt_pred = predict(best_fit_tree_ppt, lat_lon_alt, class = "tree")
############################################################
library(lattice)
## Warning: package 'lattice' was built under R version 3.0.2
# plot level plots
levelpic = levelplot(best_fit_tree_ppt_pred ~ Lon + Lat, data = lat_lon_alt,
cuts = 20, region = TRUE, main = "Predicted CO ppt using regression tree")
print(levelpic)
############################################################