# HW 3 Q 1 JLCY, 19 Dec 2013

############################################################ 

# part i

# use CART finding best regression tree for Lees Ferry with AMO, ENSO & PDO

# read Less Ferry data online
# LFdata=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/LeesFerry-monflows-1906-2006.txt'),
# ncol=13, byrow=T)

# read from pc
LFdata = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\R session 3\\4HW3\\LeesFerry-monflows-1906-2006.txt"), 
    ncol = 13, byrow = T)

# read AMO data online
# AMOdata=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/AMO_1856-2011.txt'),
# ncol=2, byrow=T)

# read from pc
AMOdata = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\R session 3\\4HW3\\AMO_1856-2011.txt"), 
    ncol = 2, byrow = T)

# read ENSO data online
# ENSOdata=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/nino3_1905-2007.txt'),
# ncol=2, byrow=T)

# read from pc
ENSOdata = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\R session 3\\4HW3\\nino3_1905-2007.txt"), 
    ncol = 2, byrow = T)

# read PDO data online
# PDOdata=matrix(scan('http://civil.colorado.edu/~balajir/CVEN6833/R-sessions/session3/files-4HW3/PDO_1900-2011.txt'),
# ncol=2, byrow=T)

# read from pc
PDOdata = matrix(scan("C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\R session 3\\4HW3\\PDO_1900-2011.txt"), 
    ncol = 2, byrow = T)

############################################################ 

# extract data from 1906 to 2006

ntime = 101  # 1906-2006

# LF value - annual sum
LFvalue = LFdata[, 2:13]  # all month data
LFvalue = apply(LFvalue, 1, sum)  # row sum

# Divide by 10^6 to get them into Million Acre Feet (MaF)
LFvalue = LFvalue/1e+06

# AMO value
i = 1
for (i in 1:length(AMOdata[, 1])) {
    if (AMOdata[i, 1] == 1906) {
        # starts at 1906
        AMOvalue = AMOdata[i:(ntime + i - 1), 2]  # ends at 2006
    }
}

# ENSO value
i = 1
for (i in 1:length(ENSOdata[, 1])) {
    if (ENSOdata[i, 1] == 1906) {
        ENSOvalue = ENSOdata[i:(ntime + i - 1), 2]
    }
}

# PDO value
i = 1
for (i in 1:length(PDOdata[, 1])) {
    if (PDOdata[i, 1] == 1906) {
        PDOvalue = PDOdata[i:(ntime + i - 1), 2]
    }
}

############################################################ 

# fitting using package 'rpart'

library(rpart)

fit <- rpart(LFvalue ~ AMOvalue + ENSOvalue + PDOvalue, method = "anova", data = data.frame(LFvalue))

printcp(fit)  # display the results 
## 
## Regression tree:
## rpart(formula = LFvalue ~ AMOvalue + ENSOvalue + PDOvalue, data = data.frame(LFvalue), 
##     method = "anova")
## 
## Variables actually used in tree construction:
## [1] AMOvalue  ENSOvalue
## 
## Root node error: 1947/101 = 19
## 
## n= 101 
## 
##      CP nsplit rel error xerror xstd
## 1 0.142      0      1.00   1.03 0.12
## 2 0.069      1      0.86   0.94 0.11
## 3 0.039      2      0.79   0.99 0.12
## 4 0.028      4      0.71   1.00 0.12
## 5 0.027      5      0.68   1.02 0.13
## 6 0.015      6      0.66   1.03 0.13
## 7 0.010      7      0.64   1.06 0.13

plotcp(fit)  # visualize cross-validation results 

plot of chunk unnamed-chunk-1

par(ask = TRUE)

# summary(fit) # detailed summary of splits

# create additional plots
par(mfrow = c(1, 2))  # two plots on one page 
rsq.rpart(fit)  # visualize cross-validation results  \t
## 
## Regression tree:
## rpart(formula = LFvalue ~ AMOvalue + ENSOvalue + PDOvalue, data = data.frame(LFvalue), 
##     method = "anova")
## 
## Variables actually used in tree construction:
## [1] AMOvalue  ENSOvalue
## 
## Root node error: 1947/101 = 19
## 
## n= 101 
## 
##      CP nsplit rel error xerror xstd
## 1 0.142      0      1.00   1.03 0.12
## 2 0.069      1      0.86   0.94 0.11
## 3 0.039      2      0.79   0.99 0.12
## 4 0.028      4      0.71   1.00 0.12
## 5 0.027      5      0.68   1.02 0.13
## 6 0.015      6      0.66   1.03 0.13
## 7 0.010      7      0.64   1.06 0.13

plot of chunk unnamed-chunk-1


par(ask = TRUE)

par(mfrow = c(1, 1))  # 1 plot on one page 

# plot tree
plot(fit, uniform = TRUE, main = "Regression Tree for LF flow using 'rpart'")
text(fit, use.n = TRUE, all = TRUE, cex = 0.9)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# find cp value that mimiizes CV error
min_cp = fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"]

# prune the tree
pfit = prune(fit, cp = min_cp)  # from cptable   

# plot the pruned tree
plot(pfit, uniform = TRUE, main = "Pruned Regression Tree for LF flow")
text(pfit, use.n = TRUE, all = TRUE, cex = 0.8)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# fitting using package 'tree'

library(tree)
## Warning: package 'tree' was built under R version 3.0.2

# function fit_tree input different indices to find total residual deviance
# fitting input: combinations of AMOvalue, ENSOvalue, PDOvalue

fit_tree = function(LFvalue, dframe) {

    dframe = data.frame(dframe)

    # fit tree
    fit_tree = tree(LFvalue ~ ., data = dframe)

    # total residual deviance
    trdev = sum(sapply(resid(fit_tree), function(x) (x - mean(resid(fit_tree)))^2))

    # return
    result = list(fit_tree = fit_tree, trdev = trdev)

    return(result)
}

############################################################ 

# input different combinations of indices

# only AMO
dframe = cbind(AMOvalue)

fit_tree1 = fit_tree(LFvalue, dframe)

############################################################ 

# only ENSO
dframe = cbind(ENSOvalue)

fit_tree2 = fit_tree(LFvalue, dframe)

############################################################ 

# only PDO
dframe = cbind(PDOvalue)

fit_tree3 = fit_tree(LFvalue, dframe)

############################################################ 

# only AMO & ENSO
dframe = cbind(AMOvalue, ENSOvalue)

fit_tree4 = fit_tree(LFvalue, dframe)

############################################################ 

# only AMO & PDO
dframe = cbind(AMOvalue, PDOvalue)

fit_tree5 = fit_tree(LFvalue, dframe)

############################################################ 

# only ENSO & PDO
dframe = cbind(ENSOvalue, PDOvalue)

fit_tree6 = fit_tree(LFvalue, dframe)

############################################################ 

# All 3: AMO ENSO & PDO
dframe = cbind(AMOvalue, ENSOvalue, PDOvalue)

fit_tree7 = fit_tree(LFvalue, dframe)

############################################################ 

# find tree with least deviance and set to best_fit_tree

# group deviances
list_tree_dev = cbind(fit_tree1$trdev, fit_tree2$trdev, fit_tree3$trdev, fit_tree4$trdev, 
    fit_tree5$trdev, fit_tree6$trdev, fit_tree7$trdev)
min_list_tree_dev = min(list_tree_dev)  # min of list
where_min_list_tree_dev = match(min_list_tree_dev, list_tree_dev)  # position of min dev

# find out best_fit_tree with lowest deviance
if (where_min_list_tree_dev == 1) best_fit_tree = fit_tree1$fit_tree
if (where_min_list_tree_dev == 2) best_fit_tree = fit_tree2$fit_tree
if (where_min_list_tree_dev == 3) best_fit_tree = fit_tree3$fit_tree
if (where_min_list_tree_dev == 4) best_fit_tree = fit_tree4$fit_tree
if (where_min_list_tree_dev == 5) best_fit_tree = fit_tree5$fit_tree
if (where_min_list_tree_dev == 6) best_fit_tree = fit_tree6$fit_tree
if (where_min_list_tree_dev == 7) best_fit_tree = fit_tree7$fit_tree

############################################################ 

# plot tree
plot(best_fit_tree)
text(best_fit_tree, cex = 0.8)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# pruning - evaluating error on training data

# Sequence of pruned tree sizes/errors
best_fit_tree.seq = prune.tree(best_fit_tree)

# Plots size vs. error
plot(best_fit_tree.seq)

plot of chunk unnamed-chunk-1


# Vector of error rates for prunings, in order
best_fit_tree.seq$dev
##  [1]  959.0  980.3 1038.9 1101.2 1134.8 1187.5 1247.3 1384.2 1537.3 1670.8
## [11] 1947.0

par(ask = TRUE)

# Positions of optimal (with respect to error) trees
opt.trees = which(best_fit_tree.seq$dev == min(best_fit_tree.seq$dev))

# Size of smallest optimal tree
optimal_size = min(best_fit_tree.seq$size[opt.trees])  # = 15

cat("Size of smallest optimal tree = ", optimal_size, "\n", "\n")
## Size of smallest optimal tree =  15 
## 

############################################################ 

# CV

# fit tree
best_fit_tree = tree(LFvalue ~ AMOvalue + ENSOvalue + PDOvalue, data = data.frame(LFvalue))

best_fit_tree_cv = cv.tree(best_fit_tree)

plot(best_fit_tree_cv)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# part ii

# separate different data subsets

# 1906 - 1921: predict

LFvalue_06_21 = LFvalue[1:16]
AMOvalue_06_21 = AMOvalue[1:16]
ENSOvalue_06_21 = ENSOvalue[1:16]
PDOvalue_06_21 = PDOvalue[1:16]

# 1922 - 2006: fit

LFvalue_22_06 = LFvalue[17:101]
AMOvalue_22_06 = AMOvalue[17:101]
ENSOvalue_22_06 = ENSOvalue[17:101]
PDOvalue_22_06 = PDOvalue[17:101]

############################################################ 

# 1975 - 1990: predict

LFvalue_75_90 = LFvalue[70:85]
AMOvalue_75_90 = AMOvalue[70:85]
ENSOvalue_75_90 = ENSOvalue[70:85]
PDOvalue_75_90 = PDOvalue[70:85]

# 1906 - 1974 & 1991 - 2006: fit

LFvalue_06_74_91_06 = append(LFvalue[1:69], LFvalue[86:101])
AMOvalue_06_74_91_06 = append(AMOvalue[1:69], AMOvalue[86:101])
ENSOvalue_06_74_91_06 = append(ENSOvalue[1:69], ENSOvalue[86:101])
PDOvalue_06_74_91_06 = append(PDOvalue[1:69], PDOvalue[86:101])

############################################################ 

# 1990 - 2006: predict

LFvalue_90_06 = LFvalue[85:101]
AMOvalue_90_06 = AMOvalue[85:101]
ENSOvalue_90_06 = ENSOvalue[85:101]
PDOvalue_90_06 = PDOvalue[85:101]

# 1906 - 1989: fit

LFvalue_06_89 = LFvalue[1:84]
AMOvalue_06_89 = AMOvalue[1:84]
ENSOvalue_06_89 = ENSOvalue[1:84]
PDOvalue_06_89 = PDOvalue[1:84]

############################################################ 

# function best_fit_tree_epoch predict LF flow of epoch from regression tree
# of outside epoch fitting input: LF, AMO, ENSO, PDO predicting input: LF,
# AMO, ENSO, PDO

best_fit_tree_epoch = function(LF_fit_input, AMO_fit_input, ENSO_fit_input, 
    PDO_fit_input, LF_pred_input, AMO_pred_input, ENSO_pred_input, PDO_pred_input) {

    # fitting input AMO, ENSO & PDO
    AMO = AMO_fit_input
    ENSO = ENSO_fit_input
    PDO = PDO_fit_input

    climate_index = cbind(AMO, ENSO, PDO)

    climate_index = as.data.frame(climate_index)

    # fit tree
    best_fit_tree = tree(LF_fit_input ~ ., climate_index)

    # plot tree
    plot(best_fit_tree)
    text(best_fit_tree, cex = 0.8)

    par(ask = TRUE)

    # predicting input AMO, ENSO & PDO
    AMO = AMO_pred_input
    ENSO = ENSO_pred_input
    PDO = PDO_pred_input

    climate_index = cbind(AMO, ENSO, PDO)

    climate_index = as.data.frame(climate_index)

    # predict
    best_fit_tree_pred = predict(best_fit_tree, climate_index, class = "tree")

    # R^2 for prediction
    R2 = (cor(LF_pred_input, best_fit_tree_pred))^2

    # RMSE for prediction
    N = length(LF_pred_input)
    RMSE = sqrt((sum((best_fit_tree_pred - LF_pred_input)^2))/N)

    # return prediction, R^2 & RMSE
    result = list(best_fit_tree_pred = best_fit_tree_pred, R2 = R2, RMSE = RMSE)

    return(result)
}

############################################################ 

# predict LF flow of 1906-1921 from regression tree of 1922-2006

y = best_fit_tree_epoch(LF_fit_input = LFvalue_22_06, AMO_fit_input = AMOvalue_22_06, 
    ENSO_fit_input = ENSOvalue_22_06, PDO_fit_input = PDOvalue_22_06, LF_pred_input = LFvalue_06_21, 
    AMO_pred_input = AMOvalue_06_21, ENSO_pred_input = ENSOvalue_06_21, PDO_pred_input = PDOvalue_06_21)

plot of chunk unnamed-chunk-1


# plot observed & predicted values with 1:1 line; print R^2 & RMSE

plot(LFvalue_06_21, y$best_fit_tree_pred, main = "Observed vs Predicted LF flow 1906-1921")
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

cat("R^2 for predicting 1906-1921 = ", y$R2, "\n", "\n")
## R^2 for predicting 1906-1921 =  0.04497 
## 
cat("RMSE for predicting 1906-1921 = ", y$RMSE, "\n", "\n")
## RMSE for predicting 1906-1921 =  5.241 
## 

############################################################ 

# predict LF flow of 1975-1990 from regression tree of 1906-1994 & 1991-2006

y = best_fit_tree_epoch(LF_fit_input = LFvalue_06_74_91_06, AMO_fit_input = AMOvalue_06_74_91_06, 
    ENSO_fit_input = ENSOvalue_06_74_91_06, PDO_fit_input = PDOvalue_06_74_91_06, 
    LF_pred_input = LFvalue_75_90, AMO_pred_input = AMOvalue_75_90, ENSO_pred_input = ENSOvalue_75_90, 
    PDO_pred_input = PDOvalue_75_90)

plot of chunk unnamed-chunk-1


# plot observed & predicted values with 1:1 line; print R^2 & RMSE

plot(LFvalue_75_90, y$best_fit_tree_pred, main = "Observed vs Predicted LF flow 1975-1990")
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

cat("R^2 for predicting 1975-1990 = ", y$R2, "\n", "\n")
## R^2 for predicting 1975-1990 =  0.1553 
## 
cat("RMSE for predicting 1975-1990 = ", y$RMSE, "\n", "\n")
## RMSE for predicting 1975-1990 =  5.408 
## 

############################################################ 

# predict LF flow of 1906-1921 from regression tree of 1922-2006

y = best_fit_tree_epoch(LF_fit_input = LFvalue_06_89, AMO_fit_input = AMOvalue_06_89, 
    ENSO_fit_input = ENSOvalue_06_89, PDO_fit_input = PDOvalue_06_89, LF_pred_input = LFvalue_90_06, 
    AMO_pred_input = AMOvalue_90_06, ENSO_pred_input = ENSOvalue_90_06, PDO_pred_input = PDOvalue_90_06)

plot of chunk unnamed-chunk-1


# plot observed & predicted values with 1:1 line; print R^2 & RMSE

plot(LFvalue_90_06, y$best_fit_tree_pred, main = "Observed vs Predicted LF flow 1990-2006")
abline(0, 1)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

cat("R^2 for predicting 1990-2006 = ", y$R2, "\n", "\n")
## R^2 for predicting 1990-2006 =  0.01583 
## 
cat("RMSE for predicting 1990-2006 = ", y$RMSE, "\n", "\n")
## RMSE for predicting 1990-2006 =  5.234 
## 

############################################################ 

# Bonus

# fit tree to CO precip, predict on DEM grid

# data point in data set = 491
N = 491

# read data online dat =
# read.table('http://cires.colorado.edu/~aslater/CVEN_6833/colo_precip.dat',
# nrows = N)

# read data from pc
dat <- read.table(file = "C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 1\\colo_precip.dat", 
    nrows = N)

# load independent variable X and depedent variable Y
X = dat[, 1:3]
Y = dat[, 4]

# Name Lat, Lon, Alt as independent variables
Lat = X[, 1]
Lon = X[, 2]
Alt = X[, 3]

# Name Ppt as dependent variable
Ppt = Y

############################################################ 

# fit tree
lat_lon_alt = cbind(Lat, Lon, Alt)

lat_lon_alt = as.data.frame(lat_lon_alt)

# fit tree
best_fit_tree_ppt = tree(Ppt ~ ., lat_lon_alt)

# plot tree
plot(best_fit_tree_ppt)
text(best_fit_tree_ppt, cex = 0.8)

plot of chunk unnamed-chunk-1


par(ask = TRUE)

############################################################ 

# load DEM grid

# 1705 points
N = 1705

# read in grid dat =
# read.table('http://cires.colorado.edu/~aslater/CVEN_6833/colo_dem.dat',
# nrows = N)

# read data from pc
dat <- read.table(file = "C:\\Users\\jfklee90\\Desktop\\CVEN 6833\\HW 1\\colo_dem.dat", 
    nrows = N)

# load independent variable X
X = dat[, 1:3]

# Name Lat, Lon, Alt as independent variables
Lat = X[, 1]
Lon = X[, 2]
Alt = X[, 3]

# predict with new data
lat_lon_alt = cbind(Lat, Lon, Alt)

lat_lon_alt = as.data.frame(lat_lon_alt)

# predict
best_fit_tree_ppt_pred = predict(best_fit_tree_ppt, lat_lon_alt, class = "tree")

############################################################ 

library(lattice)
## Warning: package 'lattice' was built under R version 3.0.2

# plot level plots
levelpic = levelplot(best_fit_tree_ppt_pred ~ Lon + Lat, data = lat_lon_alt, 
    cuts = 20, region = TRUE, main = "Predicted CO ppt using regression tree")
print(levelpic)

plot of chunk unnamed-chunk-1


############################################################