user2 — Dec 8, 2013, 3:47 PM
#-----------------------------------------------------------------------------------------
# Exam Statistical Computing (in plain R)
#-----------------------------------------------------------------------------------------
set.seed(1)
#-----------------------------------------------------------------------------------------
# ========================
## Exercise 1
# ========================
data <- scan("illinois.csv")
#lik <- function(x, y) sum(dgamma(x=data, shape=x, scale=y, log=T))
# MLE for eta: we write down the log-likelihood, take partial derivatives
# with respect to eta and lambda, substitute one into the other,
# and are left with an equation in eta alone that must equal zero. This one:
eta.mle <- function(eta, n, X) n*log(eta) - n*log(mean(X)) + sum(log(X)) - n*digamma(eta)
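# Sanity check (a sketch, assuming the rate parametrization): this should equal the
# numerical derivative of the profile log-likelihood, with lambda profiled out as eta/mean(X):
profile.ll <- function(eta, X) sum(dgamma(X, shape=eta, rate=eta/mean(X), log=T))
h <- 1e-6
(profile.ll(0.5 + h, data) - profile.ll(0.5 - h, data)) / (2*h)  # ~ eta.mle(0.5, length(data), data)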
# We want this to be zero, so we use bisection to find the root.
# First we look for an interval where the function changes sign:
x <- seq(0.2, 0.8, 0.001)
plot(x, eta.mle(x, length(data), data))
# so we see we can safely start with 0.1 and 0.9
calculate.estimate <- function(data) {
  # starting values:
  bounds <- c(0.1, 0.9)
  iterate.bisection <- function(m) {
    middle <- (m[1] + m[2]) / 2
    value <- eta.mle(middle, length(data), data)
    if(value < 0)  # the function is decreasing, so a negative value means the root lies left of middle
      return(c(m[1], middle))
    else
      return(c(middle, m[2]))
  }
  for(i in 1:100)
    bounds <- iterate.bisection(bounds)
  estimates <- data.frame(eta=mean(bounds))
  # And now the rate lambda is simply eta / mean(X), since the gamma mean is eta/lambda:
  estimates$lambda <- estimates$eta / mean(data)
  return(estimates)
}
(estimates <- calculate.estimate(data)) # lambda is the rate (dgamma's third argument), not the scale
eta lambda
1 0.4408 1.964
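# Cross-check (a sketch): base R's uniroot() solves the same equation and should
# agree with the hand-rolled bisection:
uniroot(function(e) eta.mle(e, length(data), data), c(0.1, 0.9))$root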
# Make a histogram:
hist(data, freq=F)
x <- seq(0, 3, 0.01)
lines(x, dgamma(x, estimates$eta, estimates$lambda))  # overlay the fitted gamma density
## Now we estimate the standard errors by parametric bootstrap, assuming the data really are gamma (otherwise this estimate is not valid):
t <- c()
for(i in 1:1000) {
  t <- rbind(t, calculate.estimate(rgamma(length(data), estimates$eta, estimates$lambda)))
}
# SE:
se <- list(lambda=sqrt(mean((t$lambda - estimates$lambda)^2)))
se$eta <- sqrt(mean((t$eta - estimates$eta)^2))
se
$lambda
[1] 0.2604
$eta
[1] 0.03434
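# Cross-check (a sketch): asymptotic SEs from the inverse observed information,
# using the numerical Hessian that optim() can return at the MLE:
negll <- function(p) -sum(dgamma(data, shape=p[1], rate=p[2], log=T))
fit <- optim(c(estimates$eta, estimates$lambda), negll, hessian=T)
sqrt(diag(solve(fit$hessian)))  # roughly comparable to se$eta and se$lambda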
# Correlation coefficients:
cor(t)
eta lambda
eta 1.0000 0.6122
lambda 0.6122 1.0000
cor(t)[[1,2]]
[1] 0.6122
# Contour plot:
# (named loglik, not density: it computes the log-likelihood, and `density` would shadow stats::density)
loglik <- function(eta, lambda) {
  sum(dgamma(data, eta, lambda, log=T))
}
x <- seq(estimates$eta - 2 * se$eta, estimates$eta + 2 * se$eta, 0.01)
y <- seq(estimates$lambda - 2 * se$lambda, estimates$lambda + 2 * se$lambda, 0.03)
z <- outer(x, y, Vectorize(loglik))
#persp(x,y,z, theta=25, phi=25)
# By Wilks' theorem, the 95% (99%) likelihood-ratio confidence region is the set
# where the log-likelihood is within qchisq(0.95, 2)/2 (qchisq(0.99, 2)/2) of its maximum:
d1 <- qchisq(0.95, 2)
d2 <- qchisq(0.99, 2)
max.ll <- max(z)
level1 <- max.ll - d1 / 2
level2 <- max.ll - d2 / 2
contour(x,y,z, levels=c(150, 175, level1, level2, 185))
### COMMENT:
# The correlation coefficient is about 0.61, clearly positive but well below 1.
# We can see the positive correlation in the contour plot: the contours are tilted
# ellipses, so as eta increases, the likelihood-maximizing lambda increases too.
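# Illustration (a sketch): the bootstrap replicates from above show the same tilt
# as the contours:
plot(t$eta, t$lambda)
abline(lm(lambda ~ eta, data=t))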
#-----------------------------------------------------------------------------------------
# ========================
# Exercise 2
# ========================
require(boot)
Loading required package: boot
quant.func <- function(data, sample) {
  sample.data <- data[sample]  # select the bootstrap resample
  sort(sample.data)[0.95 * length(sample.data)]  # crude 95% sample quantile (index truncated to an integer)
}
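# A more standard alternative (a sketch; note that quantile() uses a slightly
# different definition of the sample quantile):
#quant.func2 <- function(data, sample) quantile(data[sample], 0.95, names=F)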
# Parametric estimate: simulate a gamma sample of the same size as the data from the fitted distribution:
gamma.data <- rgamma(227, estimates$eta, estimates$lambda)
param <- boot(gamma.data, quant.func, R=1000)
# Truth for the parametric case: the exact 95% quantile of the fitted gamma:
(quant <- qgamma(0.95, estimates$eta, estimates$lambda))
[1] 0.9013
# SE for our estimate (as asked in the question: estimated minus truth):
estimate.se <- sqrt(mean((param$t - quant)^2))
parametric.estimate <- mean(param$t)
# So confidence interval:
c(parametric.estimate - 1.96 * estimate.se, parametric.estimate + 1.96 * estimate.se)
[1] 0.6925 1.0581
# For the nonparametric estimate we assume nothing about the distribution; all we have are the bootstrap replicates nonparam$t:
nonparam <- boot(data, quant.func, R=1000)
nonparam.estimate <- mean(nonparam$t)
# Same as above: as asked in the question, estimated minus truth:
estimate.se2 <- sqrt(mean((nonparam$t - quant)^2))
# So for the nonparametric version we find this confidence interval:
c(nonparam.estimate - 1.96 * estimate.se2, nonparam.estimate + 1.96 * estimate.se2)
[1] 0.621 1.287
# In this interpretation of the question, we see that estimate.se for the parametric method
# is slightly lower than estimate.se2 for the nonparametric method. This is to be expected,
# since the nonparametric method does not assume that the data really come from a gamma
# distribution.
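# For comparison (a sketch): the boot package's own percentile interval for the
# nonparametric run:
boot.ci(nonparam, type="perc")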
#-----------------------------------------------------------------------------------------
# ========================
# Exercise 3
# ========================
## The direct way is plain Monte Carlo on the maxima of 10 N(0,1) draws:
n <- 100000
samp <- split(rnorm(n*10), 1:n)
t <- sapply(samp, max)
out1 <- (t * (t>4))
mean(out1)
[1] 0.001683
sd(out1)/ sqrt(n)
[1] 0.0002665
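# Check (a sketch): the quantity has an exact value by numerical integration,
# since the max of 10 iid N(0,1) draws has density 10*dnorm(t)*pnorm(t)^9:
integrate(function(t) t * 10 * dnorm(t) * pnorm(t)^9, 4, Inf)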
## A more efficient way is importance sampling: sample from a proposal that puts
## more mass above t=4, then reweight by the density ratio. We keep the proposal
## close to the target, choosing N(0, sd=2):
samp2 <- split(rnorm(n*10, sd=2), 1:n)
t <- sapply(samp2, max)
dens_new <- sapply(samp2, function(x) sum(dnorm(x, sd=2, log=T)))
dens_old <- sapply(samp2, function(x) sum(dnorm(x, log=T)))
out2 <- (t * (t>4) * exp(dens_old - dens_new))
mean(out2)
[1] 0.001337
sd(out2) / sqrt(n)
[1] 8.74e-05
# We see that this estimate is far better: its standard error is about three times smaller.
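# Weight diagnostic (a sketch): the effective sample size of the importance
# weights; a value much smaller than n would signal weight degeneracy:
w <- exp(dens_old - dens_new)
sum(w)^2 / sum(w^2)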
#-----------------------------------------------------------------------------------------
# ========================
## Exercise 4
# ========================
survival <- read.csv("survival.csv", head=F)
names(survival) <- c("time","delta")
# delta is 1 when the observation is not censored, i.e. when the recorded time is the true survival time
lambda <- 0.1  # starting value for EM
lambda.step <- function(lambda, data) {
  # E-step and M-step in one line: for an exponential, E[T | T > c] = c + 1/lambda,
  # so censored times are imputed as time + 1/lambda, and the M-step is 1/mean of the
  # completed data. (Derived on paper, and also given in the question.)
  1 / mean(data$time + (1 - data$delta) / lambda)
}
for(i in 1:20)
  print(lambda <- lambda.step(lambda, survival))
[1] 0.1602
[1] 0.2486
[1] 0.3685
[1] 0.5151
[1] 0.673
[1] 0.8216
[1] 0.9447
[1] 1.036
[1] 1.099
[1] 1.14
[1] 1.166
[1] 1.181
[1] 1.191
[1] 1.196
[1] 1.2
[1] 1.202
[1] 1.203
[1] 1.203
[1] 1.204
[1] 1.204
# Likelihood:
# For observed values the contribution is dexp(time, lambda). For censored values it is all the
# density to the right of the observed time, i.e. the tail integral 1 - CDF(time, lambda):
lik <- function(x) prod(survival$delta * dexp(survival$time, x) +
                        (1 - survival$delta) * (1 - pexp(survival$time, x)))
x <- seq(0, 3, 0.001)
y <- Vectorize(lik)(x)
plot(x,y, type="l")
(mle.lambda <- x[which(y==max(y))])
[1] 1.204
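# The same maximum via optimize() instead of a grid (a sketch):
optimize(lik, c(0.01, 3), maximum=T)$maximum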
# So we find that lambda from EM and this MLE are close:
lambda
[1] 1.204
mle.lambda
[1] 1.204
# which makes us happy.
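# Final check (a sketch, using the standard closed form): for right-censored
# exponential data the MLE is the number of observed events over total observed time:
sum(survival$delta) / sum(survival$time)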
#-----------------------------------------------------------------------------------------