Difference between confidence and prediction intervals

When plotting results of your model, you can report “intervals”. There are two types of them. Confidence interval tells you something about the estimated parameters, while the prediction interval tells you where the next future observation is likely to fall (given the tolerance level, often as 95%). See also this discussion on Stackoverflow and R Help mailing list.

Here we try to demonstrate this with a simple example using linear regression.

set.seed(357)
library(ggplot2) # for ggplot()
library(gridExtra) # for grid.arrange() at the bottom

## Loading required package: grid


x <- rnorm(20)
y <- x * rnorm(20, mean = 3, sd = 1)
xy <- data.frame(x, y)

mdl <- lm(y ~ x, data = xy)

# Predict these data for...
predx <- data.frame(x = seq(from = -2, to = 3, by = 0.1))

# ... confidence interval
conf.int <- cbind(predx, predict(mdl, newdata = predx, interval = "confidence", level = 0.95))

# ... prediction interval
pred.int <- cbind(predx, predict(mdl, newdata = predx, interval = "prediction", level = 0.95))
man <- predict(mdl, newdata = predx, se = TRUE)

# Manual calculation of confidence interval, tolerance of 0.95 (1.96).
lvl <- qt(1-(1 - 0.95)/2, mdl$df.residual) # Thank you, Roland (http://chat.stackoverflow.com/transcript/message/10581408#10581408)
conf.int.man <- cbind(predx, fit = man$fit, lwr = man$fit - lvl * man$se.fit, upr = man$fit + lvl * man$se.fit)


# Manual calculation of prediction interval, tolerance level same as above.
# We will need to calculate the variance:
MSQ <- anova(mdl)[2, 3]
xdash <- predx$x - mean(xy$x)
SS <- sum((xy$x - mean(xy$x))^2)
n <- nrow(xy)
my.var <- MSQ * (1 + (1/n) + xdash^2/SS)

pred.int.man <- cbind(predx, fit = man$fit, lwr = man$fit - lvl * sqrt(my.var), upr = man$fit + lvl * sqrt(my.var))

# Plot each CI individually.
g.conf <- ggplot(conf.int, aes(x = x, y = fit)) +
  theme_bw() +
  ggtitle("Confidence interval of estimated parameters from predict()") +
  geom_point(data = xy, aes(x = x, y = y)) +
  geom_smooth(data = conf.int, aes(ymin = lwr, ymax = upr), stat = "identity") 

g.pred <- ggplot(pred.int, aes(x = x, y = fit)) +
  theme_bw() +
  ggtitle("Prediction interval for future observations from predict()") +
  geom_point(data = xy, aes(x = x, y = y)) +
  geom_smooth(data = pred.int, aes(ymin = lwr, ymax = upr), stat = "identity")

g.conf.man <- ggplot(conf.int, aes(x = x, y = fit)) +
  theme_bw() +
  ggtitle("Manually calculated 95% confidence interval of parameters") +
  geom_point(data = xy, aes(x = x, y = y)) +
  geom_smooth(data = conf.int.man, aes(ymin = lwr, ymax = upr), stat = "identity")

g.pred.man <- ggplot(conf.int.man, aes(x = x, y = fit)) +
  theme_bw() +
  ggtitle("Manually calculated 95% prediction interval for future observations") +
  geom_point(data = xy, aes(x = x, y = y)) +
  geom_smooth(data = pred.int.man, aes(ymin = lwr, ymax = upr), stat = "identity")

grid.arrange(g.conf, g.conf.man, g.pred, g.pred.man, ncol = 2)

plot of chunk plotindividual