user2 — Dec 8, 2013, 3:47 PM
#-----------------------------------------------------------------------------------------
# Exam Statistical Computing (in plain R)
#-----------------------------------------------------------------------------------------
set.seed(1)
#-----------------------------------------------------------------------------------------
# ========================
## Exercise 1
# ========================
data <- scan("illinois.csv")
#lik <- function(x, y) sum(dgamma(x=data, shape=x, scale=y, log=T))
# MLE for eta: we write down the log-likelihood, take partial derivatives
# with respect to eta and lambda, substitute one into the other,
# and are left with an equation in eta alone that must equal zero. This one:
eta.mle <- function(eta, n, X) n*log(eta) - n*log(mean(X)) + sum(log(X)) - n*digamma(eta)
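# Sanity check (a sketch, assuming the rate parametrization): this should equal the
# numerical derivative of the profile log-likelihood, with lambda profiled out as eta/mean(X):
profile.ll <- function(eta, X) sum(dgamma(X, shape=eta, rate=eta/mean(X), log=T))
h <- 1e-6
(profile.ll(0.5 + h, data) - profile.ll(0.5 - h, data)) / (2*h)  # ~ eta.mle(0.5, length(data), data)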
# We want this to be zero, so we use bisection to find the root.
# First we look for an interval where the function changes sign:
x <- seq(0.2, 0.8, 0.001)
plot(x, eta.mle(x, length(data), data))
# so we see we can safely start with 0.1 and 0.9
calculate.estimate <- function(data) {
  # starting values:
  bounds <- c(0.1, 0.9)
  iterate.bisection <- function(m) {
    middle <- (m[1] + m[2]) / 2
    value <- eta.mle(middle, length(data), data)
    if(value < 0)  # the function is decreasing, so a negative value means the root lies left of middle
      return(c(m[1], middle))
    else
      return(c(middle, m[2]))
  }
  for(i in 1:100)
    bounds <- iterate.bisection(bounds)
  estimates <- data.frame(eta=mean(bounds))
  # And now the rate lambda is simply eta / mean(X), since the gamma mean is eta/lambda:
  estimates$lambda <- estimates$eta / mean(data)
  return(estimates)
}
(estimates <- calculate.estimate(data)) # lambda is the rate (dgamma's third argument), not the scale
eta lambda
1 0.4408 1.964
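# Cross-check (a sketch): base R's uniroot() solves the same equation and should
# agree with the hand-rolled bisection:
uniroot(function(e) eta.mle(e, length(data), data), c(0.1, 0.9))$root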
# Make a histogram:
hist(data, freq=F)
x <- seq(0, 3, 0.01)
lines(x, dgamma(x, estimates$eta, estimates$lambda))  # overlay the fitted gamma density
## Now we estimate the standard errors by parametric bootstrap, assuming the data really are gamma (otherwise this estimate is not valid):
t <- c()
for(i in 1:1000) {
  t <- rbind(t, calculate.estimate(rgamma(length(data), estimates$eta, estimates$lambda)))
}
# SE:
se <- list(lambda=sqrt(mean((t$lambda - estimates$lambda)^2)))
se$eta <- sqrt(mean((t$eta - estimates$eta)^2))
se
$lambda
[1] 0.2604
$eta
[1] 0.03434
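# Cross-check (a sketch): asymptotic SEs from the inverse observed information,
# using the numerical Hessian that optim() can return at the MLE:
negll <- function(p) -sum(dgamma(data, shape=p[1], rate=p[2], log=T))
fit <- optim(c(estimates$eta, estimates$lambda), negll, hessian=T)
sqrt(diag(solve(fit$hessian)))  # roughly comparable to se$eta and se$lambda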
# Correlation coefficients:
cor(t)
eta lambda
eta 1.0000 0.6122
lambda 0.6122 1.0000
cor(t)[[1,2]]
[1] 0.6122
# Contour plot:
# (named loglik, not density: it computes the log-likelihood, and `density` would shadow stats::density)
loglik <- function(eta, lambda) {
  sum(dgamma(data, eta, lambda, log=T))
}
x <- seq(estimates$eta - 2 * se$eta, estimates$eta + 2 * se$eta, 0.01)
y <- seq(estimates$lambda - 2 * se$lambda, estimates$lambda + 2 * se$lambda, 0.03)
z <- outer(x, y, Vectorize(loglik))
#persp(x,y,z, theta=25, phi=25)
# By Wilks' theorem, the 95% (99%) likelihood-ratio confidence region is the set
# where the log-likelihood is within qchisq(0.95, 2)/2 (qchisq(0.99, 2)/2) of its maximum:
d1 <- qchisq(0.95, 2)
d2 <- qchisq(0.99, 2)
max.ll <- max(z)
level1 <- max.ll - d1 / 2
level2 <- max.ll - d2 / 2
contour(x,y,z, levels=c(150, 175, level1, level2, 185))
### COMMENT:
# The correlation coefficient is about 0.61, clearly positive but well below 1.
# We can see the positive correlation in the contour plot: the contours are tilted
# ellipses, so as eta increases, the likelihood-maximizing lambda increases too.
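# Illustration (a sketch): the bootstrap replicates from above show the same tilt
# as the contours:
plot(t$eta, t$lambda)
abline(lm(lambda ~ eta, data=t))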
#-----------------------------------------------------------------------------------------
# ========================
# Exercise 2
# ========================
require(boot)
Loading required package: boot
quant.func <- function(data, sample) {
  sample.data <- data[sample]  # select the bootstrap resample
  sort(sample.data)[0.95 * length(sample.data)]  # crude 95% sample quantile (index truncated to an integer)
}
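# A more standard alternative (a sketch; note that quantile() uses a slightly
# different definition of the sample quantile):
#quant.func2 <- function(data, sample) quantile(data[sample], 0.95, names=F)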
# Parametric estimate: simulate a gamma sample of the same size as the data from the fitted distribution:
gamma.data <- rgamma(227, estimates$eta, estimates$lambda)
param <- boot(gamma.data, quant.func, R=1000)
# Truth for the parametric case: the exact 95% quantile of the fitted gamma:
(quant <- qgamma(0.95, estimates$eta, estimates$lambda))
[1] 0.9013
# SE for our estimate (as asked in the question: estimated minus truth):
estimate.se <- sqrt(mean((param$t - quant)^2))
parametric.estimate <- mean(param$t)
# So confidence interval:
c(parametric.estimate - 1.96 * estimate.se, parametric.estimate + 1.96 * estimate.se)
[1] 0.6925 1.0581
# For the nonparametric estimate we assume nothing about the distribution; all we have are the bootstrap replicates nonparam$t:
nonparam <- boot(data, quant.func, R=1000)
nonparam.estimate <- mean(nonparam$t)
# Same as above: as asked in the question, estimated minus truth:
estimate.se2 <- sqrt(mean((nonparam$t - quant)^2))
# So for the nonparametric version we find this confidence interval:
c(nonparam.estimate - 1.96 * estimate.se2, nonparam.estimate + 1.96 * estimate.se2)
[1] 0.621 1.287
# In this interpretation of the question, we see that estimate.se for the parametric method
# is slightly lower than estimate.se2 for the nonparametric method. This is to be expected,
# since the nonparametric method does not assume that the data really come from a gamma
# distribution.
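# For comparison (a sketch): the boot package's own percentile interval for the
# nonparametric run:
boot.ci(nonparam, type="perc")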
#-----------------------------------------------------------------------------------------
# ========================
# Exercise 3
# ========================
## The direct way is plain Monte Carlo on the maxima of 10 N(0,1) draws:
n <- 100000
samp <- split(rnorm(n*10), 1:n)
t <- sapply(samp, max)
out1 <- (t * (t>4))
mean(out1)
[1] 0.001683
sd(out1)/ sqrt(n)
[1] 0.0002665
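# Check (a sketch): the quantity has an exact value by numerical integration,
# since the max of 10 iid N(0,1) draws has density 10*dnorm(t)*pnorm(t)^9:
integrate(function(t) t * 10 * dnorm(t) * pnorm(t)^9, 4, Inf)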
## A more efficient way is importance sampling: sample from a proposal that puts
## more mass above t=4, then reweight by the density ratio. We keep the proposal
## close to the target, choosing N(0, sd=2):
samp2 <- split(rnorm(n*10, sd=2), 1:n)
t <- sapply(samp2, max)
dens_new <- sapply(samp2, function(x) sum(dnorm(x, sd=2, log=T)))
dens_old <- sapply(samp2, function(x) sum(dnorm(x, log=T)))
out2 <- (t * (t>4) * exp(dens_old - dens_new))
mean(out2)
[1] 0.001337
sd(out2) / sqrt(n)
[1] 8.74e-05
# We see that this estimate is far better: its standard error is about three times smaller.
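# Weight diagnostic (a sketch): the effective sample size of the importance
# weights; a value much smaller than n would signal weight degeneracy:
w <- exp(dens_old - dens_new)
sum(w)^2 / sum(w^2)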
#-----------------------------------------------------------------------------------------
# ========================
## Exercise 4
# ========================
survival <- read.csv("survival.csv", head=F)
names(survival) <- c("time","delta")
# delta is 1 when the observation is not censored, i.e. when the recorded time is the true survival time
lambda <- 0.1  # starting value for EM
lambda.step <- function(lambda, data) {
  # E-step and M-step in one line: for an exponential, E[T | T > c] = c + 1/lambda,
  # so censored times are imputed as time + 1/lambda, and the M-step is 1/mean of the
  # completed data. (Derived on paper, and also given in the question.)
  1 / mean(data$time + (1 - data$delta) / lambda)
}
for(i in 1:20)
  print(lambda <- lambda.step(lambda, survival))
[1] 0.1602
[1] 0.2486
[1] 0.3685
[1] 0.5151
[1] 0.673
[1] 0.8216
[1] 0.9447
[1] 1.036
[1] 1.099
[1] 1.14
[1] 1.166
[1] 1.181
[1] 1.191
[1] 1.196
[1] 1.2
[1] 1.202
[1] 1.203
[1] 1.203
[1] 1.204
[1] 1.204
# Likelihood:
# For observed values the contribution is dexp(time, lambda). For censored values it is all the
# density to the right of the observed time, i.e. the tail integral 1 - CDF(time, lambda):
lik <- function(x) prod(survival$delta * dexp(survival$time, x) +
                        (1 - survival$delta) * (1 - pexp(survival$time, x)))
x <- seq(0, 3, 0.001)
y <- Vectorize(lik)(x)
plot(x,y, type="l")
(mle.lambda <- x[which(y==max(y))])
[1] 1.204
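# The same maximum via optimize() instead of a grid (a sketch):
optimize(lik, c(0.01, 3), maximum=T)$maximum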
# So we find that lambda from EM and this MLE are close:
lambda
[1] 1.204
mle.lambda
[1] 1.204
# which makes us happy.
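# Final check (a sketch, using the standard closed form): for right-censored
# exponential data the MLE is the number of observed events over total observed time:
sum(survival$delta) / sum(survival$time)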
#-----------------------------------------------------------------------------------------