# R Project for Civil and Environmental Engineering Applications

# CVEN 5454

# A. Michael Bauer SID: 810 235 807

# May 10th, 2013

library(knitr)
library(e1071)
## Loading required package: class

# Problem 1. Lee's Ferry Basic Statistics and Boxplot

lf = read.table("http://civil.colorado.edu/~balajir/r-session-files/Leesferry-mon-data.txt")
boxplot(lf[, 2:13], axes = F)
axis(2)
axis(1, at = 1:12, labels = month.abb)
box()
points(1:12, apply(lf[, 2:13], 2, mean), pch = 19, col = "red")

[Figure: boxplots of monthly Lee's Ferry flows with monthly means overlaid in red]


### Problem 2. Lee's Ferry Flow Data - Mean, Variance, Skew and Their
### Robust Counterparts

# Quartile (Bowley) coefficient of skewness - a robust counterpart to skew,
# built from the quartiles and the median
myqcskewness <- function(x) {
    x25 <- as.double(quantile(x, prob = 0.25))
    x75 <- as.double(quantile(x, prob = 0.75))
    qcsk <- (x25 + x75 - 2 * median(x))/(x75 - x25)
    qcsk
}
lfmean = apply(lf[, 2:13], 2, mean)
lfvar = apply(lf[, 2:13], 2, var)
lfskew = apply(lf[, 2:13], 2, skewness)
lfmed = apply(lf[, 2:13], 2, median)
lfIQR = apply(lf[, 2:13], 2, IQR)
lfqcskew = apply(lf[, 2:13], 2, myqcskewness)
plot(lfmean, type = "l", main = "Mean Flow - Lee's Ferry", xlab = "Month", 
    ylab = "Mean Flow Rate")

[Figure: monthly mean flow]

plot(lfvar, type = "l", main = "Flow Variance - Lee's Ferry", xlab = "Month", 
    ylab = "Flow Rate Variance", col = "red")

[Figure: monthly flow variance]

plot(lfskew, type = "l", main = "Skew of Flow - Lee's Ferry", xlab = "Month", 
    ylab = "Skew of Flow Rate", col = "blue")

[Figure: monthly flow skewness]

plot(lfmed, type = "l", main = "Median Flow - Lee's Ferry", xlab = "Month", 
    ylab = "Median Flow Rate", col = "green")

[Figure: monthly median flow]

plot(lfIQR, type = "l", main = "IQR of Flow - Lee's Ferry", xlab = "Month", 
    ylab = "IQR of Flow Rate", col = "green")

[Figure: monthly flow IQR]

plot(lfqcskew, type = "l", main = "QC Skew of Flow - Lee's Ferry", xlab = "Month", 
    ylab = "QC Skew of Flow Rate", col = "red")

[Figure: monthly quartile-coefficient skewness]


### Problem 3. Lee's Ferry Flow Data with Overlaid PDFs of Fitted Models

library(ggplot2)
library(MASS)
for (i in 2:13) {
    x = lf[, i]
    fitnormpdf = dnorm(sort(x), mean(x), sd(x))
    gammax = fitdistr(x/10000, "gamma")
    fitgammapdf = dgamma(sort(x/10000), shape = gammax$estimate[1], rate = gammax$estimate[2])/10000
    weibullx = fitdistr(x/10000, "weibull")
    fitweibullpdf = dweibull(sort(x/10000), shape = weibullx$estimate[1], 
        scale = weibullx$estimate[2])/10000

    fitlognorm = dlnorm(sort(x), mean(log(x)), sd(log(x)))
    hist(x, prob = T, main = month.name[(i - 1)])
    lines(sort(x), fitnormpdf, col = "blue")
    lines(sort(x), fitgammapdf, col = "red")
    lines(sort(x), fitweibullpdf, col = "green")
    lines(sort(x), fitlognorm, col = "black")
    legend("topright", col = c("blue", "red", "green", "black"), legend = c("Normal", 
        "Gamma", "Weibull", "Lognormal"), lty = 1, lwd = 1)
}
## Warning: NaNs produced (repeated; these warnings come from fitdistr
## evaluating the Gamma and Weibull likelihoods at infeasible parameter
## values during optimization and do not affect the final fits)

[Figures: one histogram per month with the fitted Normal (blue), Gamma (red),
Weibull (green) and Lognormal (black) PDFs overlaid]


# Problem 4. Highly Skewed Lee's Ferry Flow Data - Fitting Models

library(ADGofTest)
library(BSDA)
## Loading required package: lattice
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
## 
## Orange

i = 3
x = lf[, i]
fitnormpdf = dnorm(sort(x), mean(x), sd(x))
gammax = fitdistr(x/10000, "gamma")
fitgammapdf = dgamma(sort(x/10000), shape = gammax$estimate[1], rate = gammax$estimate[2])/10000
# Refit the Weibull on the February data (previously fitweibullpdf was left
# over from the December iteration of the loop above)
weibullx = fitdistr(x/10000, "weibull")
fitweibullpdf = dweibull(sort(x/10000), shape = weibullx$estimate[1], 
    scale = weibullx$estimate[2])/10000
fitlognorm = dlnorm(sort(x), mean(log(x)), sd(log(x)))
hist(x, prob = T, main = month.name[(i - 1)])
lines(sort(x), fitnormpdf, col = "blue")
lines(sort(x), fitgammapdf, col = "red")
lines(sort(x), fitweibullpdf, col = "green")
lines(sort(x), fitlognorm, col = "black")
legend("topright", col = c("blue", "red", "green", "black"), legend = c("Normal", 
    "Gamma", "Weibull", "Lognormal"), lty = 1, lwd = 1)

[Figure: February flow histogram with fitted Normal, Gamma, Weibull and Lognormal PDFs]


# Problem 4a. Goodness of Fit tests for Various Distributions on February
# flow data

# H0: Sample data are consistent with the model distribution H1: Sample
# data are *not* consistent with the model distribution Alpha = 0.05 =
# significance level

x = lf[, 3]

# 4a.i) K-S test for goodness of fit of Normal

zznorm = ks.test(x, "pnorm", mean = mean(x), sd = sd(x))
zznorm$p.value
## [1] 0.1408

# Since p-value=0.14 > 0.05=alpha, we do not reject the null hypothesis.
# The sample data are consistent with the normal distribution at the 5%
# significance level.  This does not fully agree with the visual check of
# the normal distribution against the positively skewed histogram.

# 4a.ii) K-S test for goodness of fit of Lognormal
zzlnorm = ks.test(x, "plnorm", mean = mean(log(x)), sd = sd(log(x)))
zzlnorm$p.value
## [1] 0.6846

# Since p-value=0.68 > 0.05=alpha, we do not reject the null hypothesis.
# The sample data are consistent with the lognormal distribution at the 5%
# significance level.

# 4a.iii) K-S test for Gamma
zgamma = fitdistr(x/10000, dgamma, list(shape = 1, rate = 0.1), lower = 0.01)
zgamma
##     shape       rate  
##   19.29829    0.48937 
##  ( 2.77594) ( 0.07131)
zzgamma = ks.test(x/10000, "pgamma", shape = zgamma$estimate[1], scale = 1/zgamma$estimate[2])
zzgamma
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x/10000
## D = 0.0855, p-value = 0.4648
## alternative hypothesis: two-sided

# Since p-value=0.46 > 0.05=alpha, we do not reject the null hypothesis.
# The sample data are consistent with the Gamma distribution at the 5%
# significance level.

# 4a.iv) K-S test for Weibull
zweibull = fitdistr(x/10000, "weibull", list(shape = 1, scale = 0.1), lower = 0.001)
zzweib = ks.test(x/10000, "pweibull", shape = zweibull$estimate[1], scale = zweibull$estimate[2])
zzweib
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x/10000
## D = 0.1309, p-value = 0.07026
## alternative hypothesis: two-sided

# Since p-value=0.07 > 0.05=alpha, we do not reject the null hypothesis.
# The sample data are consistent with the Weibull distribution at the 5%
# significance level.  At the 10% level (alpha=0.1), however, we would
# reject, since p=0.07 < 0.1.
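
# For a compact side-by-side comparison, the four K-S p-values can be
# collected into one named vector; a minimal sketch, reusing the fitted
# objects zgamma and zweibull from above:

ks.pvals = c(normal = ks.test(x, "pnorm", mean = mean(x), sd = sd(x))$p.value, 
    lognormal = ks.test(x, "plnorm", meanlog = mean(log(x)), sdlog = sd(log(x)))$p.value, 
    gamma = ks.test(x/10000, "pgamma", shape = zgamma$estimate[1], scale = 1/zgamma$estimate[2])$p.value, 
    weibull = ks.test(x/10000, "pweibull", shape = zweibull$estimate[1], scale = zweibull$estimate[2])$p.value)
round(ks.pvals, 3)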

# 4a.v) A-D test for Normal Distribution
adnorm = ad.test(x, pnorm, mean = mean(x), sd = sd(x))
adnorm$p.value
##      AD 
## 0.06112

# Since p-value=0.06 > 0.05=alpha, we do not reject the null hypothesis at
# the 5% significance level: the sample data are consistent with the normal
# distribution.  At the 10% level (alpha=0.1), however, we would reject.

# 4a.vi) A-D test for Lognormal Distribution

adlnorm = ad.test(log(x), plnorm, mean = mean(x), sd = sd(x))
adlnorm$p.value
##        AD 
## 6.316e-06

# Since p-value=6.3e-06 << 0.05=alpha, we reject the null hypothesis: the
# log-transformed sample is not consistent with the fitted distribution.
# Note, however, that this call passes log(x) to plnorm together with the
# untransformed mean and sd, so the rejection partly reflects a mismatched
# parameterization.
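
# For comparison, here is the A-D call parameterized consistently with the
# lognormal fit used in the K-S test above; a sketch:

# A-D test of the raw data against the fitted lognormal (meanlog/sdlog
# estimated from the log-transformed sample)
adlnorm2 = ad.test(x, plnorm, meanlog = mean(log(x)), sdlog = sd(log(x)))
adlnorm2$p.value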

# 4a.vii) A-D test for Gamma Distribution
adgamma = ad.test(x, pgamma, shape = zgamma$estimate[1], scale = 1/zgamma$estimate[2])
adgamma$p.value
##        AD 
## 6.316e-06

# Since p-value=6.3e-06 << 0.05=alpha, we reject the null hypothesis.  The
# sample data are not consistent with the Gamma distribution at any
# reasonable significance level.  (Note that x is on the raw scale here,
# while the parameters were fitted to x/10000, which contributes to the
# rejection.)

# 4a.viii) A-D test for Weibull Distribution
adweibull = ad.test(x, pweibull, shape = zweibull$estimate[1], scale = zweibull$estimate[2])
adweibull$p.value
##        AD 
## 6.316e-06

# Since p-value=6.3e-06 << 0.05=alpha, we reject the null hypothesis.  The
# sample data are not consistent with the Weibull distribution.  (The same
# scale mismatch as in the Gamma test above applies here.)

# 4a.ix) Chi-Square test of Goodness of Fit for Normal Distribution
chisq.test(lf[, 3], p = fitnormpdf/sum(fitnormpdf))
## 
##  Chi-squared test for given probabilities
## 
## data:  lf[, 3]
## X-squared = 1.026e+09, df = 94, p-value < 2.2e-16

# Since p-value < 2.2e-16 << 0.05=alpha, we reject the null hypothesis.  The
# sample data are not consistent with the normal distribution.

# 4a.x) Chi-Square test of Goodness of Fit for Lognormal Distribution
chisq.test(x, p = fitlognorm/sum(fitlognorm))
## 
##  Chi-squared test for given probabilities
## 
## data:  x
## X-squared = 162080488, df = 94, p-value < 2.2e-16
# Since p-value < 2.2e-16 << 0.05=alpha, we reject the null hypothesis.  The
# sample data are not consistent with the lognormal distribution.

# 4a.xi) Chi-Square test of Goodness of Fit for Gamma Distribution
chisq.test(x, p = fitgammapdf/sum(fitgammapdf))
## 
##  Chi-squared test for given probabilities
## 
## data:  x
## X-squared = 2.82e+08, df = 94, p-value < 2.2e-16

# Since p-value < 2.2e-16 << 0.05=alpha, we reject the null hypothesis.  The
# sample data are not consistent with the Gamma distribution.

# 4a.xii) Chi-Square test of Goodness of Fit for Weibull Distribution
chisq.test(x, p = fitweibullpdf/sum(fitweibullpdf))
## 
##  Chi-squared test for given probabilities
## 
## data:  x
## X-squared = 165095082, df = 94, p-value < 2.2e-16

# Since p-value < 2.2e-16 << 0.05=alpha, we reject the null hypothesis.  The
# sample data are not consistent with the Weibull distribution.
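
# A caveat: chisq.test as used above treats each observation as its own
# category with density-derived weights, which is not the standard binned
# chi-square goodness-of-fit test, so these extreme statistics should be
# read cautiously.  A sketch of the binned version for the normal fit
# (k = 8 equal-probability bins is an arbitrary illustrative choice; the
# degrees of freedom are not adjusted for the two estimated parameters,
# so the p-value is approximate):

k = 8
breaks = qnorm(seq(0, 1, length = k + 1), mean = mean(x), sd = sd(x))  # -Inf..Inf
obs = table(cut(x, breaks))  # observed counts per bin
chisq.test(as.vector(obs), p = rep(1/k, k))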

# 4 b. Plots of Model Quantiles vs. Sample Quantiles

# 4b. i) QQ Plot of Normal Distribution versus Sample Distribution

qqplot(qnorm(ppoints(length(x)), mean = mean(x), sd = sd(x)), x, 
    main = "QQ Plot of Fitted Normal versus Empirical Quantiles", 
    xlab = "Fitted Normal Quantiles", ylab = "Sample Quantiles")

[Figure: QQ plot, fitted Normal vs sample quantiles]


# As some of the goodness-of-fit tests suggested, the sample data do not
# match the Normal distribution well.

# 4b. ii) QQ Plot of Lognormal Distribution versus Sample Distribution

qqplot(qlnorm(ppoints(length(x)), meanlog = mean(log(x)), sdlog = sd(log(x))), x, 
    main = "QQ Plot of Fitted Lognormal versus Empirical Quantiles", 
    xlab = "Fitted Lognormal Quantiles", ylab = "Sample Quantiles")

[Figure: QQ plot, fitted Lognormal vs sample quantiles]


# As some of the goodness-of-fit tests suggested (the A-D and chi-square
# tests, though not the K-S test), the sample data do not match the
# Lognormal distribution well.

# 4b. iii) QQ Plot of Gamma Distribution versus Sample Distribution

qqplot(qgamma(ppoints(length(x)), shape = zgamma$estimate[1], rate = zgamma$estimate[2]) * 
    10000, x, main = "QQ Plot of Fitted Gamma versus Empirical Quantiles", 
    xlab = "Fitted Gamma Quantiles", ylab = "Sample Quantiles")

[Figure: QQ plot, fitted Gamma vs sample quantiles]

qqplot(qweibull(ppoints(length(x)), shape = zweibull$estimate[1], scale = zweibull$estimate[2]) * 
    10000, x, main = "QQ Plot of Fitted Weibull versus Empirical Quantiles", 
    xlab = "Fitted Weibull Quantiles", ylab = "Sample Quantiles")

[Figure: QQ plot, fitted Weibull vs sample quantiles]


# The quantile plots also suggest rejecting the null hypothesis: the sample
# data do not match these distributions well.

# Problem 5. Nonparametric vs. Parametric PDF of May, August and December
# Flows at Lee's Ferry
library(sm)
## Package `sm', version 2.2-4.1 Copyright (C) 1997, 2000, 2005, 2007, 2008,
## A.W.Bowman & A.Azzalini Type help(sm) for summary information

may = lf[, 6]
aug = lf[, 9]
dec = lf[, 13]

# 5 a) Kernel-Smoothed (Nonparametric) PDFs of Monthly Flows
mayeval = seq(min(may) - sd(may), max(may) + sd(may), length = 100)
augeval = seq(min(aug) - sd(aug), max(aug) + sd(aug), length = 100)
deceval = seq(min(dec) - sd(dec), max(dec) + sd(dec), length = 100)
maypdf = sm.density(may, eval.points = mayeval, display = "none")$estimate
augpdf = sm.density(aug, eval.points = augeval, display = "none")$estimate
decpdf = sm.density(dec, eval.points = deceval, display = "none")$estimate
fitnormpdf.may = dnorm(sort(may), mean(may), sd(may))
hist(may, prob = T, main = "May Lee's Ferry Flow", xlab = "Flow Rate", ylab = "PDF")
legend("topright", legend = c("Normal", "Actual"), lty = c(2, 1), lwd = 1)
lines(mayeval, maypdf, lty = 1)
lines(sort(may), fitnormpdf.may, lty = 2)

[Figure: May flow histogram with kernel-smoothed and fitted Normal PDFs]

fitnormpdf.aug = dnorm(sort(aug), mean(aug), sd(aug))
hist(aug, prob = T, main = "August Lee's Ferry Flow", xlab = "Flow Rate", ylab = "PDF")
lines(augeval, augpdf, lty = 1)
lines(sort(aug), fitnormpdf.aug, lty = 2)
legend("topright", legend = c("Normal", "Actual"), lty = c(2, 1), lwd = 1)

[Figure: August flow histogram with kernel-smoothed and fitted Normal PDFs]

fitnormpdf.dec = dnorm(sort(dec), mean(dec), sd(dec))
hist(dec, prob = T, main = "December Lee's Ferry Flow", xlab = "Flow Rate", 
    ylab = "PDF")
lines(deceval, decpdf, lty = 1)
lines(sort(dec), fitnormpdf.dec, lty = 2)
legend("topright", legend = c("Normal", "Actual"), lty = c(2, 1), lwd = 1)

[Figure: December flow histogram with kernel-smoothed and fitted Normal PDFs]


# The kernel-smoothed PDFs can differ noticeably from the fitted Normal
# PDFs.  December most closely resembles a normal distribution; this may be
# due to cold temperatures and a lack of runoff events, unlike May, for
# example, where we see a bimodal distribution.
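
# The kernel estimates above use sm's default normal-reference bandwidth;
# the sketch below shows how sensitive the May PDF is to halving and
# doubling that bandwidth (assuming may, mayeval and maypdf from above):

plot(mayeval, maypdf, type = "l", xlab = "Flow Rate", ylab = "PDF", 
    main = "Bandwidth Sensitivity - May Flows")
for (h in c(0.5, 2) * hnorm(may)) {
    # re-estimate the kernel PDF with the alternative bandwidth h
    d = sm.density(may, h = h, eval.points = mayeval, display = "none")
    lines(mayeval, d$estimate, lty = 2)
}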

# Problem 6. Parametric Monte Carlo Simulation of Old Faithful Data

data = read.csv("C:/Users/Katita/Downloads/Old_faithful_data.csv", header = T)
xdata = data[, 1]

# The historical data are in the vector 'xdata'
N = length(xdata)  #number of data points.

# For Monte Carlo...
nsim = 500  #number of simulations, each of length N

# points at which to estimate the PDF from the simulations and the
# observed

xeval = seq(min(xdata) - sd(xdata), max(xdata) + sd(xdata), length = 100)
neval = length(xeval)
xsimpdf = matrix(0, nrow = nsim, ncol = neval)

# Vectors for the statistics
meansim = 1:nsim
mediansim = 1:nsim
sdsim = 1:nsim
skewsim = 1:nsim
iqrsim = 1:nsim

# H0: Sample data are consistent with the model distribution H1: Sample
# data are *not* consistent with the model distribution Alpha = 0.05 =
# significance level

# norm K-S TEST
zznorm = ks.test(xdata, "pnorm", mean = mean(xdata), sd = sd(xdata))
## Warning: ties should not be present for the Kolmogorov-Smirnov test
zznorm$p.value
## [1] 0.0001216
# p-value = 0.00012 < 0.05, so we reject the null hypothesis

# log-norm K-S TEST
zzlnorm = ks.test(xdata, "plnorm", mean = mean(log(xdata)), sd = sd(log(xdata)))
## Warning: ties should not be present for the Kolmogorov-Smirnov test
zzlnorm$p.value
## [1] 3.8e-07
# p-value = 3.8e-07 < 0.05, so we reject the null hypothesis

# gamma K-S TEST
zgamma = fitdistr(xdata, dgamma, list(shape = 1, rate = 0.1), lower = 0.01)
zgamma
##     shape       rate  
##   25.16272    0.34809 
##  ( 2.04734) ( 0.02861)
zzgamma = ks.test(xdata, "pgamma", shape = zgamma$estimate[1], scale = 1/zgamma$estimate[2])
## Warning: ties should not be present for the Kolmogorov-Smirnov test
zzgamma
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  xdata
## D = 0.1503, p-value = 2.861e-06
## alternative hypothesis: two-sided
# p-value = 2.9e-06 < 0.05, so we reject the null hypothesis

# weibull K-S TEST
zweibull = fitdistr(xdata, "weibull", list(shape = 1, scale = 0.1), lower = 0.001)
zzweib = ks.test(xdata, "pweibull", shape = zweibull$estimate[1], scale = zweibull$estimate[2])
## Warning: ties should not be present for the Kolmogorov-Smirnov test
zzweib
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  xdata
## D = 0.0991, p-value = 0.005747
## alternative hypothesis: two-sided

# p-value = 0.006 < 0.05, so we reject the null hypothesis.  However, this
# p-value is the largest of the four tests; as such, the Weibull is the
# distribution I will choose to fit.

# Model Estimation

## Kernel PDF
xdensitykernel = sm.density(xdata, eval.points = xeval, display = "none")$estimate

# Weibull
xdensityorig = dweibull(xeval, shape = zweibull$estimate[1], scale = zweibull$estimate[2])

######## Simulation ##########

for (i in 1:nsim) {
    # simulate from Weibull..
    xsim = rweibull(N, shape = zweibull$estimate[1], scale = zweibull$estimate[2])


    # compute the statistics from the simulation
    meansim[i] = mean(xsim)
    sdsim[i] = sd(xsim)
    # sdsim[i]=sd(log(xsim))
    skewsim[i] = skewness(xsim)
    iqrsim[i] = diff(quantile(xsim, c(0.25, 0.75)))
    mediansim[i] = quantile(xsim, c(0.5))

    # estimate the PDF at the evaluation points from the simulated data,
    # using the chosen distribution

    # Weibull..
    zz = fitdistr(xsim, "weibull")
    xsimpdf[i, ] = dweibull(xeval, shape = zz$estimate[1], scale = zz$estimate[2])
}

# Boxplots of statistics from the simulations, with the corresponding
# value from the original data overlaid as a red point.

par(mfrow = c(1, 5))

zz = boxplot(meansim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Mean", cex = 1.25)
points(z1, mean(xdata), col = "red", cex = 1.25, pch = 16)
title(main = "Mean")

zz = boxplot(mediansim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, quantile(xdata, c(0.5)), col = "red", cex = 1.25, pch = 16)
title(main = "Median")

zz = boxplot(sdsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "SD", cex = 1.25)
# points(z1,sd(log(xdata)),col='red',cex=1.25)
points(z1, sd(xdata), col = "red", cex = 1.25, pch = 16)
title(main = "SD")

zz = boxplot(skewsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Skew", cex = 1.25)
points(z1, skewness(xdata), col = "red", cex = 1.25, pch = 16)
title(main = "Skew")

zz = boxplot(iqrsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "IQR", cex = 1.25, ylim = range(c(iqrsim, diff(quantile(xdata, 
    c(0.25, 0.75))))))
points(z1, diff(quantile(xdata, c(0.25, 0.75))), col = "red", cex = 1.25, pch = 16)
title(main = "IQR")

[Figure: boxplots of simulated mean, median, SD, skew and IQR with observed values in red]


# par(ask=TRUE)

# boxplot of PDFs from the simulations along with that of the original
# data

par(mfrow = c(1, 1))
xs = 1:neval

# For R version 2.4.1
zz = boxplot(split(t(xsimpdf), xs), plot = F, cex = 1)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, ylim = range(xsimpdf, xdensityorig), xlab = "", ylab = "", cex = 1.25)

npts = 10  #number of points to plot on the x-axis..
n2 = round(neval * (1:npts)/npts)

z2 = z1[n2]
n1 = xeval[n2]


n1 = round(n1, dig = 2)
n1 = as.character(n1)

axis(1, at = z2, labels = n1, cex = 1)
lines(z1, xdensityorig, col = "red")

lines(z1, xdensitykernel, col = "blue", lwd = 2)

title(main = "PDFs from the simulations and the historical data")

[Figure: boxplots of simulated PDFs with the fitted Weibull PDF (red) and kernel PDF (blue)]


# Problem 7. Non-Parametric Approach to Old Faithful Modelling

# For Monte Carlo...
nsim = 500  #number of simulations, each of length N

# Points at which to estimate the PDF from the simulations and the
# observed

xeval = seq(min(xdata) - sd(xdata), max(xdata) + sd(xdata), length = 100)
neval = length(xeval)
xsimpdf = matrix(0, nrow = nsim, ncol = neval)

# Vectors for Statistics
meansim = 1:nsim
mediansim = 1:nsim
sdsim = 1:nsim
skewsim = 1:nsim
iqrsim = 1:nsim

# Evaluate the Kernel density PDF at the evaluation points based on the
# data

xdensityorig = sm.density(xdata, eval.points = xeval, display = "none")$estimate

# Bandwidth of the data
band = hnorm(xdata)
######## Simulation ##########

for (i in 1:nsim) {

    xsim = sample(xdata, replace = T)
    meansim[i] = mean(xsim)
    sdsim[i] = sd(xsim)
    # sdsim[i]=sd(log(xsim))
    skewsim[i] = skewness(xsim)
    iqrsim[i] = diff(quantile(xsim, c(0.25, 0.75)))
    mediansim[i] = quantile(xsim, c(0.5))

    xsimpdf[i, ] = sm.density(xsim, eval.points = xeval, display = "none")$estimate

}
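
# Note that band (the normal-reference bandwidth) is computed above but
# never used by this plain bootstrap.  A smoothed-bootstrap variant would
# jitter each resample with kernel noise; a minimal sketch:

# Smoothed bootstrap: resampled values are perturbed with normal noise of
# standard deviation 'band', so simulations are not limited to observed values
xsim = sample(xdata, replace = T) + rnorm(N, mean = 0, sd = band)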

# Boxplot of statistics from the simulations, overlaid with the
# corresponding values from the original data.

par(mfrow = c(1, 5))

zz = boxplot(meansim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Mean", cex = 1.25)
points(z1, mean(xdata), col = "red", cex = 1.25, pch = 16)
title(main = "Mean")

zz = boxplot(mediansim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, quantile(xdata, c(0.5)), col = "red", cex = 1.25, pch = 16)
title(main = "Median")

zz = boxplot(sdsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "SD", cex = 1.25)
# points(z1,sd(log(xdata)),col='red',cex=1.25)
points(z1, sd(xdata), col = "red", cex = 1.25, pch = 16)
title(main = "SD")

zz = boxplot(skewsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Skew", cex = 1.25)
points(z1, skewness(xdata), col = "red", cex = 1.25, pch = 16)
title(main = "Skew")

zz = boxplot(iqrsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "IQR", cex = 1.25)
points(z1, diff(quantile(xdata, c(0.25, 0.75))), col = "red", cex = 1.25, pch = 16)
title(main = "IQR")

[Figure: boxplots of bootstrapped mean, median, SD, skew and IQR with observed values in red]


par(ask = TRUE)

# Boxplot of PDFs from MC simulations and Original Data

par(mfrow = c(1, 1))
xs = 1:neval

# For R version 2.4.1
zz = boxplot(split(t(xsimpdf), xs), plot = F, cex = 1)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, ylim = range(xsimpdf, xdensityorig), xlab = "", ylab = "", cex = 1.25)

npts = 10  #number of points to plot on the x-axis..
n2 = round(neval * (1:npts)/npts)

z2 = z1[n2]
n1 = xeval[n2]


n1 = round(n1, dig = 2)
n1 = as.character(n1)

axis(1, at = z2, labels = n1, cex = 1)
lines(z1, xdensityorig, col = "red", lwd = 2)

title(main = "PDFs from MC Simulations and Sample Data")

[Figure: boxplots of bootstrapped PDFs with the kernel PDF of the data (red)]


# Problem 8. Investigating the relationship between inter-eruption time
# and the duration of eruptions at Old Faithful in Yellowstone National
# Park: Bivariate Histograms and PDFs

# 8.a) Bivariate Histograms: parametric and nonparametric models and
# empirical Histograms
library(gplots)
## Loading required package: gtools
## Attaching package: 'gtools'
## The following object is masked from 'package:e1071':
## 
## permutations
## Loading required package: gdata
## gdata: Unable to locate valid perl interpreter gdata: gdata: read.xls()
## will be unable to read Excel XLS and XLSX files gdata: unless the 'perl='
## argument is used to specify the location gdata: of a valid perl
## intrpreter. gdata: gdata: (To avoid display of this message in the future,
## please gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls() gdata: to support
## 'XLS' (Excel 97-2004) files.
## gdata: Unable to load perl libaries needed by read.xls() gdata: to support
## 'XLSX' (Excel 2007+) files.

## gdata: Run the function 'installXLSXsupport()' gdata: to automatically
## download and install the perl gdata: libaries needed to support Excel XLS
## and XLSX formats.
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
## 
## nobs
## The following object is masked from 'package:utils':
## 
## object.size
## Loading required package: caTools
## Loading required package: grid
## Loading required package: KernSmooth
## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
## lowess
library(akima)
X = data[, 1]
Y = data[, 2]

h2d = hist2d(X, Y, show = FALSE)
persp(h2d$x, h2d$y, h2d$counts, ticktype = "detailed", theta = 30, phi = 30, 
    expand = 0.5, shade = 0.1, col = "cyan", ltheta = -30, xlab = "Intereruption Time", 
    ylab = "Duration (min.)", zlab = "Count")

[Figure: 3-D bivariate histogram of inter-eruption time vs duration]


data = cbind(X, Y)

N = length(X)

xm = mean(X)
sigx = sd(X)
ym = mean(Y)
sigy = sd(Y)
rho = cor(X, Y)

nx = 60
ny = 50

nxy = nx * ny
xeval = seq(min(X), max(X), length = nx)
yeval = seq(min(Y), max(Y), length = ny)

jpdf = 1:nxy
xyeval = matrix(0, nrow = nxy, ncol = 2)

k = 0
for (j in 1:nx) {
    for (i in 1:ny) {
        k = k + 1
        xyeval[k, 1] = xeval[j]
        xyeval[k, 2] = yeval[i]
        nfact = 1/(2 * pi * sigx * sigy * sqrt(1 - rho^2))
        xs = (xyeval[k, 1] - xm)/sigx
        ys = (xyeval[k, 2] - ym)/sigy

        jpdf[k] = exp(-(xs * xs + ys * ys - 2 * rho * xs * ys)/(2 * (1 - rho^2)))
        jpdf[k] = jpdf[k] * nfact
    }
}
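
# The same joint density can be evaluated without the double loop; a
# minimal sketch, assuming the moments (xm, sigx, ym, sigy, rho) and the
# grids (xeval, yeval) defined above:

# Vectorized bivariate-normal density over the xeval-by-yeval grid
xs = (xeval - xm)/sigx
ys = (yeval - ym)/sigy
q = outer(xs^2, ys^2, "+") - 2 * rho * outer(xs, ys)  # quadratic form
jgrid = exp(-q/(2 * (1 - rho^2)))/(2 * pi * sigx * sigy * sqrt(1 - rho^2))
# jgrid[j, i] equals jpdf evaluated at (xeval[j], yeval[i])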

zz = interp(xyeval[, 1], xyeval[, 2], jpdf)

# I tried this: persp(zz, ticktype='detailed', theta=30, phi=30,
# expand=0.5, shade=0.5, col='cyan', ltheta=-30), but after consulting with
# others I ended up using this:

contour(zz, xlab = "Intereruption Time (min)", ylab = "Duration (min)")
points(X, Y)

[Figure: bivariate normal PDF contours with the data points overlaid]


# Nonparametric Two-Dimensional PDF
xm = mean(X)
sigx = sd(X)
ym = mean(Y)
sigy = sd(Y)
rho = cor(X, Y)

nx = 50
ny = 50

nxy = nx * ny
xeval = seq(min(X), max(X), length = nx)
yeval = seq(min(Y), max(Y), length = ny)

xyeval = expand.grid(x = xeval, y = yeval)

data = cbind(X, Y)
jpdf = sm.density(data, eval.points = xyeval, eval.grid = FALSE, display = "none")
## Loading required package: rgl
## Warning: there is no package called 'rgl'
## Loading required package: rpanel
## Warning: there is no package called 'rpanel'

zz = interp(xyeval[, 1], xyeval[, 2], jpdf$estimate)
contour(zz, xlab = "Intereruption Time (min)", ylab = "Duration (min)")
points(X, Y)

[Figure: kernel-density contours with the data points overlaid]


## Inter-eruption time and duration appear fairly strongly linearly
## correlated, with clusters of points at (short duration, short
## inter-eruption time) and (long duration, long inter-eruption time).
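
# The strength of that relationship can be quantified directly; a minimal
# sketch using the X and Y vectors above:

cor(X, Y)       # Pearson correlation coefficient
cor.test(X, Y)  # with a test of significance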

# 8. b) Joint Data: Normal and Nonparametric Conditional PDFs of the Data

data = cbind(Y, X)
colnames(data) = c("X", "Y")

Yc = 57
X1 = sort(Y)
xyc = cbind(X1, rep(Yc, N))
zz = sm.density(data, eval.points = xyc, display = "none")
## Loading required package: rgl
## Warning: there is no package called 'rgl'
## Loading required package: rpanel
## Warning: there is no package called 'rpanel'
fxc = sm.density(X, eval.points = Yc, display = "none")
conpdf = diag(zz$estimate)/fxc$estimate
plot(X1, conpdf, type = "l", xlab = "Duration (min)", ylab = "Conditional PDF")
title(main = c("Conditional PDF - at ", Yc))

[Figure: conditional PDF of duration at inter-eruption time 57 min]


Yc = 69
X1 = sort(Y)
xyc = cbind(X1, rep(Yc, N))
zz = sm.density(data, eval.points = xyc, display = "none")
## Loading required package: rgl
## Warning: there is no package called 'rgl'
## Loading required package: rpanel
## Warning: there is no package called 'rpanel'
fxc = sm.density(X, eval.points = Yc, display = "none")
conpdf = diag(zz$estimate)/fxc$estimate
plot(X1, conpdf, type = "l", xlab = "Duration (min)", ylab = "Conditional PDF")
title(main = c("Conditional PDF - at ", Yc))

[Figure: conditional PDF of duration at inter-eruption time 69 min]


Yc = 85
X1 = sort(Y)
xyc = cbind(X1, rep(Yc, N))
zz = sm.density(data, eval.points = xyc, display = "none")
## Loading required package: rgl
## Warning: there is no package called 'rgl'
## Loading required package: rpanel
## Warning: there is no package called 'rpanel'
fxc = sm.density(X, eval.points = Yc, display = "none")
conpdf = diag(zz$estimate)/fxc$estimate
plot(X1, conpdf, type = "l", xlab = "Duration (min)", ylab = "Conditional PDF")
title(main = c("Conditional PDF - at ", Yc))

[Figure: conditional PDF of duration at inter-eruption time 85 min]


## As one might expect, the conditional PDFs put more probability on
## shorter durations for the low inter-eruption category (<65 min) and on
## longer durations for the high category (>75 min). Interestingly, the
## intermediate category shows a bimodal distribution of probability over
## eruption duration, re-confirming the clustering we saw in the contour
## plots above.

# 8. c) Boxplots of the Created Models and Empirical Data: Visual Check

X = data[, 1]
Y = data[, 2]

cat1 = Y <= 65
cat2 = Y < 75 & Y > 65
cat3 = Y >= 75

X1 = X[cat1]
X2 = X[cat2]
X3 = X[cat3]

par(mfrow = c(1, 3))
boxplot(X1, main = "Intereruption <= 65 min")
boxplot(X2, main = "65 < Intereruption < 75 min")
boxplot(X3, main = "Intereruption >= 75 min")

[Figure: boxplots of duration for the three inter-eruption-time categories]

par(mfrow = c(1, 1))

# Problem 9. Linear Regression of Two Variables: Intereruption Time and
# Duration

X = data[, 1]
Y = data[, 2]

zz = lm(Y ~ X)

# plot the fitted line in x-order so it does not zigzag
ox = order(X)
plot(X[ox], zz$fit[ox], type = "l")
points(X, Y)

[Figure: fitted regression line with the data points]


par(mfrow = c(2, 2))
plot(zz)

[Figure: standard lm diagnostic plots]


### First, the residuals show a pattern of clustering at high and low
### values of y-hat.  Therefore, although the QQ plot shows a good fit to
### normality, the model does not predict very well.  The best way to
### correct this might be to fit two separate linear regressions, one for
### each cluster of inter-eruption times and eruption durations.  Then one
### could compare their PDFs on a common scale and run further correlation
### tests.

# Problem 10. Simulations of Old Faithful Intereruption and Durations
# Times

# 10. a) Bootstrap simulation of intereruption time and conditional
# simulation of eruption duration using the linear regression from Problem
# 9; repeat to generate series of the same length as the observed data

source("http://civil.colorado.edu/~balajir/CVEN5454/R-sessions/sess1/skew.r")
X = data[, 1]
Y = data[, 2]

## Bootstrap Simulations

N = length(X)
zmod = lm(Y ~ X)
Xboot = matrix(NA, nrow = N, ncol = 500)
Yboot = matrix(NA, nrow = N, ncol = 500)
meansim = 1:500
mediansim = 1:500
sdsim = 1:500
skewsim = 1:500
iqrsim = 1:500
for (i in 1:500) {

    # resample the predictor with replacement
    Xsim = sample(X, N, replace = T)
    # predict at the resampled points; newdata must carry the model's
    # predictor name, X
    yy = predict(zmod, newdata = data.frame(X = Xsim), se.fit = T)
    # add normal noise scaled by the residual standard error of the fit
    Yboot[, i] = yy$fit + rnorm(N, mean = 0, sd = yy$residual.scale)
    Xboot[, i] = Xsim

    # compute the statistics from the simulation
    meansim[i] = mean(Yboot[, i])
    mediansim[i] = median(Yboot[, i])
    sdsim[i] = sd(Yboot[, i])
    skewsim[i] = skew(Yboot[, i])
    iqrsim[i] = diff(quantile(Yboot[, i], c(0.25, 0.75)))
}

## Diagnostics:

## Boxplots of Simulated Statistics
par(mfrow = c(1, 5))

zz = boxplot(meansim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Mean", cex = 1.25)
points(z1, mean(Y), col = "red", cex = 1.25, pch = 16)
title(main = "Mean")

zz = boxplot(mediansim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25, ylim = range(c(mediansim, 
    median(Y))))
points(z1, quantile(Y, c(0.5)), col = "red", cex = 1.25, pch = 16)
title(main = "Median")

zz = boxplot(sdsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "SD", cex = 1.25, ylim = range(c(sdsim, sd(Y))))

points(z1, sd(Y), col = "red", cex = 1.25, pch = 16)
title(main = "SD")

zz = boxplot(skewsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Skew", cex = 1.25)
points(z1, skew(Y), col = "red", cex = 1.25, pch = 16)
title(main = "Skew")

zz = boxplot(iqrsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "IQR", cex = 1.25, ylim = range(c(iqrsim, diff(quantile(Y, 
    c(0.25, 0.75))))))
points(z1, diff(quantile(Y, c(0.25, 0.75))), col = "red", cex = 1.25, pch = 16)
title(main = "IQR")

[Figure: boxplots of simulated statistics with observed values in red]



# PDF of Simulated Data
par(mfrow = c(1, 1))
plot(density(Y), type = "l", ylim = range(density(Y)$y))
for (i in 1:500) lines(density(Yboot[, i]))
lines(density(Y), col = "red", lwd = 2)

[Figure: PDFs of the simulated series with the PDF of the observed data in red]



# 10. b) Nonparametric Simulation of Data using Monte Carlo Analysis

N = length(X)  # number of data points

# Monte Carlo...
nsim = 500

xeval = seq(min(X) - sd(X), max(X) + sd(X), length = 100)
neval = length(xeval)
xsimpdf = matrix(0, nrow = nsim, ncol = neval)

meansim = 1:nsim
mediansim = 1:nsim
sdsim = 1:nsim
skewsim = 1:nsim
iqrsim = 1:nsim
xdensityorig = sm.density(X, eval.points = xeval, display = "none")$estimate

band = hnorm(X)

######## Simulation ##########

Xsim = matrix(NA, nrow = N, ncol = 500)
Ysim = matrix(NA, nrow = N, ncol = 500)

for (i in 1:nsim) {
    xsim = sample(X, replace = T)

    meansim[i] = mean(xsim)
    sdsim[i] = sd(xsim)
    skewsim[i] = skew(xsim)
    iqrsim[i] = diff(quantile(xsim, c(0.25, 0.75)))
    mediansim[i] = quantile(xsim, c(0.5))

    xsimpdf[i, ] = sm.density(xsim, eval.points = xeval, display = "none")$estimate
    Xsim[, i] = xsim
}


par(mfrow = c(1, 5))

zz = boxplot(meansim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Mean", cex = 1.25)
points(z1, mean(X), col = "red", cex = 1.25, pch = 16)
title(main = "Mean")

zz = boxplot(mediansim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Median", cex = 1.25)
points(z1, quantile(X, c(0.5)), col = "red", cex = 1.25, pch = 16)
title(main = "Median")

zz = boxplot(sdsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "SD", cex = 1.25)
points(z1, sd(X), col = "red", cex = 1.25, pch = 16)
title(main = "SD")

zz = boxplot(skewsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "Skew", cex = 1.25)
points(z1, skew(X), col = "red", cex = 1.25, pch = 16)
title(main = "Skew")

zz = boxplot(iqrsim, plot = F)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, xlab = "", ylab = "IQR", cex = 1.25)
points(z1, diff(quantile(X, c(0.25, 0.75))), col = "red", cex = 1.25, pch = 16)
title(main = "IQR")

[Figure: boxplots of bootstrapped statistics with observed values in red]


par(ask = TRUE)

# Boxplots of the simulated PDFs and the PDF of the original data

par(mfrow = c(1, 1))
xs = 1:neval

zz = boxplot(split(t(xsimpdf), xs), plot = F, cex = 1)
zz$names = rep("", length(zz$names))
z1 = bxp(zz, ylim = range(xsimpdf, xdensityorig), xlab = "", ylab = "", cex = 1.25)

# Number of Points to Plot on Axis

npts = 10
n2 = round(neval * (1:npts)/npts)

z2 = z1[n2]
n1 = xeval[n2]

n1 = round(n1, dig = 2)
n1 = as.character(n1)

axis(1, at = z2, labels = n1, cex = 1)
lines(z1, xdensityorig, col = "red", lwd = 2)

title(main = "PDFs from the simulations and the historical data")

[Figure: boxplots of bootstrapped PDFs with the kernel PDF of the data (red)]


# Problem 11. Linear Regression Modeling of the Upper Blue Nile Basin

# 11. a) Best Model Selection using Linear Regression

# Initializing the Vectors

y = as.matrix(scan("http://civil.colorado.edu/~balajir/CVEN5454/r-project-data/UBN_precip.txt"))
X = read.table("http://civil.colorado.edu/~balajir/CVEN5454/r-project-data/UBN_predictors.txt")
x1 = X[, 1]
x2 = X[, 2]
x3 = X[, 3]
x4 = X[, 4]
x5 = X[, 5]
x6 = X[, 6]
x7 = X[, 7]
xfull = cbind(x1, x2, x3, x4, x5, x6, x7)
x = xfull

N = length(y)

library(leaps)
library(MPV)
## Error: there is no package called 'MPV'
combs = leaps(x, y, nbest = N)
combos = combs$which
ncombos = length(combos[, 1])
xpress = 1:ncombos
xmse = 1:ncombos
for (i in 1:ncombos) {
    zz = glm(y ~ x[, combos[i, ]])
    xpress[i] = PRESS(zz)
    xmse[i] = sum((zz$res)^2)/(N - length(zz$coef))
}
## Error: could not find function "PRESS"
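
# Since MPV failed to load, PRESS() is undefined when the loop above runs.
# The statistic can be computed directly from the standard leave-one-out
# identity for least squares; a minimal sketch:

# PRESS via leverages: e_i/(1 - h_ii) are the leave-one-out residuals
PRESS <- function(model) {
    r <- residuals(model)
    h <- hatvalues(model)
    sum((r/(1 - h))^2)
}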

bestpress = order(xpress)[1]
bestmodelp = lsfit(x[, combos[bestpress, ]], y)
summary(bestmodelp)
##              Length Class  Mode   
## coefficients  2     -none- numeric
## residuals    40     -none- numeric
## intercept     1     -none- logical
## qr            6     qr     list
combos[bestpress, ]
##     1     2     3     4     5     6     7 
##  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
summary(zz)
## 
## Call:
## glm(formula = y ~ x[, combos[i, ]])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -168.51   -62.03    -1.84    41.01   150.30  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        429.64     110.75    3.88    4e-04 ***
## x[, combos[i, ]]    41.47       8.97    4.63  4.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 6841)
## 
##     Null deviance: 406296  on 39  degrees of freedom
## Residual deviance: 259955  on 38  degrees of freedom
## AIC: 470.7
## 
## Number of Fisher Scoring iterations: 2

## From our PRESS analysis, we take as our best model: y-hat = 9194 +
## 20.63*x1 - 1.46*x4 + 29.14*x5 + 9.50*x6 (coefficients from the
## full-model summary above)
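
# Note that leaps() also returns Mallows' Cp, which gives an alternative
# ranking of the same candidate models without needing MPV; a sketch using
# the combs object above:

# Rank the candidate subsets by Mallows' Cp (smaller is better)
best.cp = order(combs$Cp)[1]
combs$which[best.cp, ]  # predictors retained by the Cp-best model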

## Model Diagnostics ##

# Normality of errors
x11 = resid(bestmodelp)
qqnorm(x11)
qqline(x11)

[Figure: normal QQ plot of the model residuals]


# The residuals fall close to the QQ line, so the assumption of normally
# distributed errors for the chosen best-fit model is reasonable.

## One-sample Kolmogorov-Smirnov test on the residuals ##

ks.test(x11, "pnorm", mean(x11), sd(x11))
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  x11
## D = 0.0799, p-value = 0.9425
## alternative hypothesis: two-sided

# Since p-value = 0.94 > 0.05, we conclude that normality of the errors is
# reasonable.

# Lags on Residuals

acf(x11)

[Figure: ACF of the model residuals]


# The ACF shows no significant autocorrelation at any lag, so treating the
# errors as independent is reasonable.

### Homoskedasticity ###

yhat = y - x11  # fitted values: observed y minus residuals
plot(yhat, x11, xlab = "Yhat", ylab = "residuals")

# The residuals are randomly scattered with respect to the predicted values,
# so our homoskedastic assumption is reasonable.

### R2
summary(lm(y ~ x))
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -127.80  -45.23   -7.35   42.65  126.09 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 9194.000   4836.175    1.90   0.0663 . 
## xx1           20.625     13.126    1.57   0.1259   
## xx2           22.154     23.053    0.96   0.3438   
## xx3            0.249      1.906    0.13   0.8968   
## xx4           -1.456      0.878   -1.66   0.1069   
## xx5           29.137     15.041    1.94   0.0616 . 
## xx6            9.502      2.926    3.25   0.0027 **
## xx7           -3.785      4.339   -0.87   0.3896   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69 on 32 degrees of freedom
## Multiple R-squared:  0.625,  Adjusted R-squared:  0.543 
## F-statistic: 7.62 on 7 and 32 DF,  p-value: 2.06e-05

# From our model diagnostics, above, we conclude that it is a reasonable
# model with which to continue.

# 11. b) Estimating the Accuracy and Precision of the Model using Dropped
# Points.

### These are the matrices of variables from the Best Model, above
X = as.matrix(cbind(x1, x4, x5, x6))
Y = as.matrix(y)

Np = round(0.1 * N)
index = 1:N
nsim = 500
rmse = c()
for (i in 1:nsim) {

    indexdrop = sample(index, Np)  # select a random 10% of points
    indexkeep = setdiff(index, indexdrop)

    # data frames so that lm() and predict() share the column names
    keep = data.frame(Y = Y[indexkeep, ], X[indexkeep, ])
    drop = data.frame(X[indexdrop, , drop = FALSE])

    zz = lm(Y ~ ., data = keep)

    yp = predict(zz, newdata = drop)
    rmse1 = sqrt(mean((Y[indexdrop, ] - yp)^2))

    rmse = c(rmse, rmse1)

}

## I was not able to run this simulation correctly, so I cannot comment on
## the skill of this model as compared to Rajagopalan and Block (2007).

# Problem 12. Upper Blue Nile Basin Logistic Regression Model

### Testing the Model using Logistic Regression on the 75th percentile.

N = length(y)
y = as.matrix(y)
x = as.matrix(xfull[, 1:7])
qy = quantile(y, 0.75)

### Instead of automating, I created the 0/1 response vector by hand; a
### sketch of an automated construction follows below.

ybin = read.csv("C:/Users/Katita/Downloads/ybin.csv")
y = as.matrix(ybin)
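
# For reference, the binary response can also be built directly in R; a
# minimal sketch, re-reading the precipitation series since y is
# overwritten above:

# 1 if annual precipitation exceeds its 75th percentile, else 0
yprec = scan("http://civil.colorado.edu/~balajir/CVEN5454/r-project-data/UBN_precip.txt")
ybin2 = as.numeric(yprec > quantile(yprec, 0.75))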

## Running the Logistic Regression
x = X
logistic = glm(y ~ x, family = "binomial")
print(summary(logistic))
## 
## Call:
## glm(formula = y ~ x, family = "binomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5711  -0.4905  -0.2916   0.0139   2.3140  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) 338.1060   176.9859    1.91    0.056 .
## xx1           0.2675     0.3833    0.70    0.485  
## xx4          -0.0588     0.0304   -1.93    0.053 .
## xx5           1.1591     0.7137    1.62    0.104  
## xx6           0.3262     0.1353    2.41    0.016 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 44.987  on 39  degrees of freedom
## Residual deviance: 27.721  on 35  degrees of freedom
## AIC: 37.72
## 
## Number of Fisher Scoring iterations: 6
print(logistic$aic)
## [1] 37.72

## In the logistic regression, with the 75th percentile of precipitation as
## the threshold for the dependent variable, the model loses some of its
## strength in variables x1 and x5, as their p-values show.  In my view,
## the logistic regression is not as useful here as the multiple regression
## in Problem 11, where the fitting and diagnostics identified the most
## influential variables.
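
# To use the fitted logistic model for prediction, the linear predictor can
# be converted to exceedance probabilities; a minimal sketch (the 0.5
# threshold is an arbitrary choice for illustration):

# Fitted probabilities of exceeding the 75th-percentile threshold
phat = predict(logistic, type = "response")
# Rough classification check against the observed 0/1 response
table(observed = as.vector(y), predicted = as.numeric(phat > 0.5))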

# Problem 13. Bayesian Regression to Model the Blue Nile Basin

# The WinBUGS model:

winmodel = function() {
    for (i in 1:n) {
        y[i] ~ dnorm(mu[i], taue)
        mu[i] <- alpha + beta1 * x1[i] + beta2 * x2[i] + beta3 * x3[i] + beta4 * 
            x4[i] + beta5 * x5[i] + beta6 * x6[i] + beta7 * x7[i]
    }

    taue ~ dgamma(0.01, 0.1)
    alpha ~ dnorm(0, 0.01)
    beta1 ~ dnorm(0, 0.01)
    beta2 ~ dnorm(0, 0.01)
    beta3 ~ dnorm(0, 0.01)
    beta4 ~ dnorm(0, 0.01)
    beta5 ~ dnorm(0, 0.01)
    beta6 ~ dnorm(0, 0.01)
    beta7 ~ dnorm(0, 0.01)
}
library(arm)
## Loading required package: Matrix
## Loading required package: lme4
## Attaching package: 'lme4'
## The following object is masked from 'package:stats':
## 
## AIC, BIC
## Loading required package: R2WinBUGS
## Loading required package: coda
## Attaching package: 'coda'
## The following object is masked from 'package:lme4':
## 
## HPDinterval
## Loading required package: boot
## Attaching package: 'boot'
## The following object is masked from 'package:gtools':
## 
## inv.logit, logit
## The following object is masked from 'package:lattice':
## 
## melanoma
## Loading required package: abind
## Loading required package: foreign
## arm (Version 1.6-05, built: 2013-3-8)
## Working directory is C:/Users/Katita/Downloads
## Attaching package: 'arm'
## The following object is masked from 'package:boot':
## 
## logit
## The following object is masked from 'package:coda':
## 
## traceplot
## The following object is masked from 'package:gtools':
## 
## logit
library(R2WinBUGS)

filename = file.path(tempdir(), "winmodel.bug")
write.model(winmodel, filename)

y = scan("http://civil.colorado.edu/~balajir/CVEN5454/r-project-data/UBN_precip.txt")
x = read.table("http://civil.colorado.edu/~balajir/CVEN5454/r-project-data/UBN_predictors.txt")
test = cbind(x, y)
colnames(test) = c("x1", "x2", "x3", "x4", "x5", "x6", "x7", "y")

N = dim(test)[1]
data = list(n = N, x1 = test[, 1], x2 = test[, 2], x3 = test[, 3], x4 = test[, 
    4], x5 = test[, 5], x6 = test[, 6], x7 = test[, 7], y = test[, 8])
inits = list(alpha = 0, beta1 = 1, beta2 = 1, beta3 = 1, beta4 = 1, beta5 = 1, 
    beta6 = 1, beta7 = 1)
parameters = c("alpha", "beta1", "beta2", "beta3", "beta4", "beta5", "beta6", 
    "beta7", "taue")
# Note: running this requires a local WinBUGS installation.
linreg.sim = bugs(data, inits = NULL, parameters.to.save = parameters, 
    model.file = filename, n.chains = 3, n.iter = 10000)
attach.bugs(linreg.sim)
hist(alpha)
hist(beta1)

### The posterior distributions are comparable to the estimates found in
### Problem 11.  However, Bayesian updating is a more powerful tool for
### predicting values of the dependent variable, and I would look carefully
### at this model when comparing the diagnostics from each.