Waiting time between eruptions and the duration of the eruption for the Old Faithful geyser in Yellowstone National Park, Wyoming, USA.
## eruptions waiting
## 1 3.600 79
## 2 1.800 54
## 3 3.333 74
## 4 2.283 62
## 5 4.533 85
## 6 2.883 55
Energy output and surface termperature for Star Cluster CYG OB1. - logst: log survface termperature of the star. - logli: log light intensity of the star.
data("CYGOB1", package = "HSAUR")
head(CYGOB1)
## logst logli
## [1,] 4.37 5.23
## [2,] 4.56 5.74
## [3,] 4.26 4.93
## [4,] 4.56 5.74
## [5,] 4.30 5.19
## [6,] 4.46 5.46
ggplot(CYGOB1, aes(x=logst, y=logli)) + geom_point(size=3)
CYGOB1d <- bkde2D(CYGOB1, bandwidth = sapply(CYGOB1,dpik)) #Gaussian
CYGOB1d2 <- bkde2D(CYGOB1, bandwidth = sapply(CYGOB1,dpik, kernel = "box")) #box
CYGOB1d3 <- bkde2D(CYGOB1, bandwidth = sapply(CYGOB1,dpik,kernel = "epanech")) #centered beta(2,2) kernel
CYGOB1d4 <- bkde2D(CYGOB1, bandwidth = sapply(CYGOB1,dpik,kernel = "biweight")) #centred beta(3,3) density
CYGOB1d5 <- bkde2D(CYGOB1, bandwidth = sapply(CYGOB1,dpik,kernel = "biweight")) #centred beta(4,4)
contour(x = CYGOB1d$x1, y = CYGOB1d$x2, z = CYGOB1d$fhat, xlab = "log surface temperature", ylab = "log light intensity")
contour(x = CYGOB1d2$x1, y = CYGOB1d2$x2, z = CYGOB1d2$fhat, xlab = "log surface temperature", ylab = "log light intensity")
contour(x = CYGOB1d3$x1, y = CYGOB1d3$x2, z = CYGOB1d3$fhat, xlab = "log surface temperature", ylab = "log light intensity")
contour(x = CYGOB1d4$x1, y = CYGOB1d4$x2, z = CYGOB1d4$fhat, xlab = "log surface temperature", ylab = "log light intensity")
contour(x = CYGOB1d5$x1, y = CYGOB1d5$x2, z = CYGOB1d5$fhat, xlab = "log surface temperature", ylab = "log light intensity") #2d contour plot
#3d contour
persp(x = CYGOB1d$x1, y = CYGOB1d$x2, z = CYGOB1d$fhat, xlab = "log surface temperature", ylab = "log light intensity",
zlab = "estimated density", theta = -35, axes = TRUE, box = TRUE, col = "yellow")
A Bayesian information criterion (BIC) is applied to choose the form of the mixture model:
mc <- Mclust(faithful$waiting)
mc
## 'Mclust' model object: (E,2)
##
## Available components:
## [1] "call" "data" "modelName" "n"
## [5] "d" "G" "BIC" "bic"
## [9] "loglik" "df" "hypvol" "parameters"
## [13] "z" "classification" "uncertainty"
and the estimated means are
mc$parameters$mean
## 1 2
## 54.61675 80.09239
with estimated standard deviation (found to be equal within both groups)
sqrt(mc$parameters$variance$sigmasq)
## [1] 5.868639
The proportion is p_hat = 0.36. The second package is called flexmix whose functionality is described by Leisch (2004). A mixture of two normals can be fitted using
fl <- flexmix(waiting ~ 1, data = faithful, k = 2)
parameters(fl, component = 1)
## Comp.1
## coef.(Intercept) 70.99948
## sigma 13.55887
parameters(fl, component = 2)
## Comp.2
## coef.(Intercept) 70.80601
## sigma 13.62634
We can get standard errors for the five parameter estimates by using a bootstrap approach (see Efron and Tibshirani, 1993). The original data are slightly perturbed by drawing n out of n observations with replacement and those artificial replications of the original data are called bootstrap samples. Now, we can fit the mixture for each bootstrap sample and assess the variability of the estimates, for example using confidence intervals. Some suitable R code based on the Mclust function follows. First, we define a function that, for a bootstrap sample indx, fits a two-component mixture model and returns p_hat and the estimated means (note that we need to make sure that we always get an estimate of p, not 1 − p):
logL <- function(param, x) {
d1 <- dnorm(x, mean = param[2], sd = param[3])
d2 <- dnorm(x, mean = param[4], sd = param[5])
-sum(log(param[1] * d1 + (1 - param[1]) * d2))
}
startparam <- c(p = 0.5, mu1 = 50, sd1 = 3, mu2 = 80, sd2 = 3)
opp <- optim(startparam, logL, x = faithful$waiting, method = "L-BFGS-B", lower = c(0.01, rep(1,4)), upper = c(0.99, rep(200, 4)))
fit <- function(x, indx) {
a <- Mclust(x[indx], minG = 2, maxG = 2)$parameters
if (a$pro[1] < 0.5)
return(c(p = a$pro[1], mu1 = a$mean[1], mu2 = a$mean[2]))
return(c(p = 1 - a$pro[1], mu1 = a$mean[2], mu2 = a$mean[1]))
}
opar <- as.list(opp$par)
rx <- seq(from = 40, to = 110, by = 0.1)
d1 <- dnorm(rx, mean = opar$mu1, sd = opar$sd1)
d2 <- dnorm(rx, mean = opar$mu2, sd = opar$sd2)
f <- opar$p * d1 + (1 - opar$p) * d2
hist(x, probability = TRUE, xlab = "Waiting times (in min.)", border = "gray", xlim = range(rx), ylim = c(0, 0.06), main = "")
lines(rx, f, lwd = 2)
lines(rx, dnorm(rx, mean = mean(x), sd = sd(x)), lty = 2, lwd = 2)
legend(50, 0.06, legend = c("Fitted two-component mixture density", "Fitted single normal density"), lty = 1:2, bty = "n")
The function fit can now be fed into the boot function for bootstrapping (here 1000 bootstrap samples are drawn)
bootpara <- boot(faithful$waiting, fit, R = 1000)
We assess the variability of our estimates p_hat by means of adjusted bootstrap percentile (BCa) confidence intervals, which for p_hat can be obtained from
boot.ci(bootpara, type = "bca", index = 1)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = bootpara, type = "bca", index = 1)
##
## Intervals :
## Level BCa
## 95% ( 0.2842, 0.4228 )
## Calculations and Intervals on Original Scale
We see that there is a reasonable variability in the mixture model, however, the means in the two components are rather stable, as can be seen from
boot.ci(bootpara, type = "bca", index = 2)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = bootpara, type = "bca", index = 2)
##
## Intervals :
## Level BCa
## 95% (52.78, 56.02 )
## Calculations and Intervals on Original Scale
for ˆµ1 and for ˆµ2 from
boot.ci(bootpara, type = "bca", index = 3)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = bootpara, type = "bca", index = 3)
##
## Intervals :
## Level BCa
## 95% (65.60, 80.98 )
## Calculations and Intervals on Original Scale
Finally, we show a graphical representation of both the bootstrap distribution of the mean estimates and the corresponding confidence intervals. For convenience, we define a function for plotting, namely
bootplot <- function(b, index, main = "") {
dens <- density(b$t[, index])
ci <- boot.ci(b, type = "bca", index = index)$bca[4:5]
est <- b$t0[index]
plot(dens, main = main)
y <- max(dens$y)/10
segments(ci[1], y, ci[2], y, lty = 2)
points(ci[1], y, pch = "(")
points(ci[2], y, pch = ")")
points(est, y, pch = 19)
}
layout(matrix(1:2, ncol = 2))
bootplot(bootpara, 2, main = expression(mu[1]))
bootplot(bootpara, 3, main = expression(mu[2]))