How to calculate the prediction interval for LMM

Using ggpredict function

library(nlme)
library(ggeffects)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse() masks nlme::collapse()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ dplyr::lag()      masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

m <- fm2 <- lme(distance ~ age + Sex, data = Orthodont, random = ~ 1|Subject) 

ggpredict(m, "age",interval = "confidence") %>% plot(rawdata = T, dot.alpha = 0.2)

ggpredict(m, "age", type = "re",interval = "prediction") %>% plot(rawdata = T, dot.alpha = 0.2)

outcome1=ggpredict(m, terms = "age [15:40]")
head(outcome1)

## # Predicted values of distance
## 
## age | Predicted |         95% CI
## --------------------------------
##  15 |     27.61 | [26.54, 28.68]
##  16 |     28.27 | [27.14, 29.40]
##  17 |     28.93 | [27.73, 30.13]
##  18 |     29.59 | [28.32, 30.86]
##  19 |     30.25 | [28.89, 31.61]
##  20 |     30.91 | [29.47, 32.36]
## 
## Adjusted for:
## *     Sex = Male
## * Subject = 0 (population-level)

outcome2=ggpredict(m, "age", type = "re",interval = "prediction")
head(outcome2)

## # Predicted values of distance
## 
## age | Predicted |         95% CI
## --------------------------------
##   8 |     22.99 | [20.00, 25.97]
##  10 |     24.31 | [21.34, 27.27]
##  12 |     25.63 | [22.66, 28.59]
##  14 |     26.95 | [23.96, 29.93]
## 
## Adjusted for:
## *     Sex = Male
## * Subject = 0 (population-level)

## 
## Intervals are prediction intervals.

# read ggpredict docment

Using predictInterval function

# Lsmeans gives a Confidence Interval, and predictInterval gives a prediction interval. 

library(lsmeans)

## Loading required package: emmeans

## The 'lsmeans' package is now basically a front end for 'emmeans'.
## Users are encouraged to switch the rest of the way.
## See help('transition') for more information, including how to
## convert old 'lsmeans' objects and scripts to work with 'emmeans'.

library(lme4)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'lme4'

## The following object is masked from 'package:nlme':
## 
##     lmList

library(nlme)

# fit model
n1 <- lmer(distance ~ age + Sex+(1|Subject), data = Orthodont ) 
n2 <-  lme(distance ~ age + Sex, data = Orthodont, random = ~ 1|Subject) 

summary(n1)

## Linear mixed model fit by REML ['lmerMod']
## Formula: distance ~ age + Sex + (1 | Subject)
##    Data: Orthodont
## 
## REML criterion at convergence: 437.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7489 -0.5503 -0.0252  0.4534  3.6575 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  Subject  (Intercept) 3.267    1.807   
##  Residual             2.049    1.432   
## Number of obs: 108, groups:  Subject, 27
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 17.70671    0.83392  21.233
## age          0.66019    0.06161  10.716
## SexFemale   -2.32102    0.76142  -3.048
## 
## Correlation of Fixed Effects:
##           (Intr) age   
## age       -0.813       
## SexFemale -0.372  0.000

summary(n2)

## Linear mixed-effects model fit by REML
##   Data: Orthodont 
##        AIC      BIC    logLik
##   447.5125 460.7823 -218.7563
## 
## Random effects:
##  Formula: ~1 | Subject
##         (Intercept) Residual
## StdDev:    1.807425 1.431592
## 
## Fixed effects:  distance ~ age + Sex 
##                 Value Std.Error DF   t-value p-value
## (Intercept) 17.706713 0.8339225 80 21.233044  0.0000
## age          0.660185 0.0616059 80 10.716263  0.0000
## SexFemale   -2.321023 0.7614168 25 -3.048294  0.0054
##  Correlation: 
##           (Intr) age   
## age       -0.813       
## SexFemale -0.372  0.000
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.74889609 -0.55034466 -0.02516628  0.45341781  3.65746539 
## 
## Number of Observations: 108
## Number of Groups: 27

# lsmeans, confidence interval 
lsmeans(n1,   list (pairwise ~ Sex ), at = list(age=11))

## $`lsmeans of Sex`
##  Sex    lsmean    SE df lower.CL upper.CL
##  Male     25.0 0.486 25     24.0     26.0
##  Female   22.6 0.586 25     21.4     23.9
## 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95 
## 
## $`pairwise differences of Sex`
##  1             estimate    SE df t.ratio p.value
##  Male - Female     2.32 0.761 25   3.048  0.0054
## 
## Degrees-of-freedom method: kenward-roger

lsmeans(n2,   list (pairwise ~ Sex ), at = list(age=11))

## $`lsmeans of Sex`
##  Sex    lsmean    SE df lower.CL upper.CL
##  Male     25.0 0.486 26     24.0     26.0
##  Female   22.6 0.586 25     21.4     23.9
## 
## Degrees-of-freedom method: containment 
## Confidence level used: 0.95 
## 
## $`pairwise differences of Sex`
##  1             estimate    SE df t.ratio p.value
##  Male - Female     2.32 0.761 25   3.048  0.0054
## 
## Degrees-of-freedom method: containment

lsmeans(n1,   "age")

##  age lsmean    SE df lower.CL upper.CL
##   11   23.8 0.381 25       23     24.6
## 
## Results are averaged over the levels of: Sex 
## Degrees-of-freedom method: kenward-roger 
## Confidence level used: 0.95

# first method: predictInterval
library(merTools)

## Loading required package: arm

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## 
## arm (Version 1.13-1, built: 2022-8-25)

## Working directory is C:/Users/hed2/Downloads/code-storage/code

new.data <- data.frame(age=11, Subject='M01',Sex="Male")
pI <- predictInterval(n1,newdata=new.data, which='fixed',level=0.95)
pI  # no applicable method for 'getME' applied to an object of class "lme"

##        fit      upr      lwr
## 1 24.98264 27.99309 21.80242

# 1 predictInterval, using the original data xs, then aggregate, the overall effect
pI <- predictInterval(n1,newdata=Orthodont ,level=0.95)
pred.data <- data.frame(Orthodont,pred=pI[1],pred_low=pI[3],pred_up=pI[2])
mean((pred.data$distance-pred.data$fit)^2)

## [1] 1.606495

# 2 fixed effect
pI <- predictInterval(n1,newdata=Orthodont, which='fixed',level=0.95)
pred.data <- data.frame(Orthodont,pred=pI[1],pred_low=pI[3],pred_up=pI[2])
mean((pred.data$distance-pred.data$fit)^2)

## [1] 4.99668

# 3 random effect 
pI <- predictInterval(n1,newdata=Orthodont, which='random',level=0.95)
pred.data <- data.frame(Orthodont,pred=pI[1],pred_low=pI[3],pred_up=pI[2])
mean((pred.data$distance-pred.data$fit)^2)

## [1] 581.5902

############## random and fixed effects
m1 <- lmer(Reaction ~ Days + (1 | Subject), sleepstudy)

intFit <- predictInterval(m1, newdata = sleepstudy ) # bounded values
head(intFit)

##        fit      upr      lwr
## 1 292.7711 336.8842 248.9802
## 2 301.9809 343.8089 259.6131
## 3 312.2498 356.2606 269.4721
## 4 322.0792 369.1102 281.0829
## 5 334.5273 379.8743 292.9503
## 6 343.9391 386.7206 299.5938

intFit2 <- predictInterval(m1, newdata = sleepstudy ,which = "random") # bounded values
head(intFit2)

##        fit      upr        lwr
## 1 43.02962 83.64611  0.2887879
## 2 41.02894 84.01380 -2.7855840
## 3 40.22276 86.12611 -1.7813414
## 4 41.65719 84.93009 -0.2718662
## 5 40.52752 84.09293 -1.2122379
## 6 41.03242 81.07262 -0.1648371

intFit3 <- predictInterval(m1, newdata = sleepstudy ,which = "fixed") # bounded values
head(intFit3)

##        fit      upr      lwr
## 1 252.0878 295.3780 210.3195
## 2 262.1760 302.6446 218.9889
## 3 271.7278 316.3573 227.9273
## 4 282.7686 326.1952 240.9669
## 5 290.3763 333.8694 251.8883
## 6 302.5332 345.1220 261.0191

Using Simulation

# second method: Simulation. This has result similar to predictInterval
sim <- simulate(n1, nsim = 1, newdata=new.data) #based on distribution with random error, like MCMC

s <- numeric()

for (i in 1:500) {
  s[i] <- simulate(n1, nsim = 1, newdata=new.data)[1]
}
s <- unlist(s)
hist(s)

quantile(s,0.025)

##     2.5% 
## 20.73304

quantile(s,0.975)

##    97.5% 
## 29.30722

# third method:
#   calculate mean and std, then calculate percentile by using normal distribution

intFit4 <- predictInterval(n1, newdata = new.data,level=0.95 ) # bounded values
head(intFit4) #using the overall effect

##        fit      upr      lwr
## 1 27.41754 30.88217 24.18202

For general linear regression

library(ggplot2)
library(nlraa)
library(nlme)
library(mgcv)

## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.

data(Oats, package = "nlme")
## A subset for simplicity
Oats.I <- subset(Oats, subset = Block == "I")
plot(Oats.I)

# fitting a linear model
fm1 <- lm(yield ~ nitro, data = Oats.I)
fm1.prd <- predict(fm1, interval = "conf")
Oats.IA <- cbind(Oats.I, fm1.prd)

## Make a plot
ggplot(data = Oats.IA, aes(x = nitro, y = yield)) + 
  geom_point() + 
  geom_line(aes(y = fit)) + 
  geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "purple", alpha = 0.4)

# bootstrapped confidence bands
fm1.boot <- boot_lm(fm1, fitted, R=200)  #randomly sampling data 
fm1.boot.prd <- summary_simulate(t(fm1.boot$t))
Oats.IAB <- cbind(Oats.I, fm1.boot.prd)

## Make a plot
ggplot(data = Oats.IAB, aes(x = nitro, y = yield)) + 
  geom_ribbon(aes(ymin = Q2.5, ymax = Q97.5), fill = "deepskyblue", alpha = 0.6) + 
  geom_line(aes(y = Estimate), color = "white", size = 1.5) +
  # geom_point() + 
  ggtitle("95% Bootstrapped Confidence Bands")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

For nonlinear

library(boot)

## 
## Attaching package: 'boot'

## The following object is masked from 'package:arm':
## 
##     logit

# Nonlinear Example: Loblolly, then using bootstrap or simulation  
Lob <- subset(Loblolly, Seed %in% c("301", "303", "305", "307", "309"))

fnlm1 <- nls(height ~ SSasymp(age, Asym, R0, lrc), data = Lob)

# bootstrapped confidence bands
fm1.boot <- boot(fnlm1, fitted, R=200)  #randomly sampling data 
fm1.boot.prd <- summary_simulate(t(fm1.boot$t))
Oats.IAB <- cbind(Lob, fm1.boot.prd)

## Plot of observed and fitted
## Make a plot
ggplot(data = Oats.IAB, aes(x = age, y = height)) + 
  geom_line(aes(y = Estimate), color = "white", size = 0.6) +
  # geom_point() + 
    geom_ribbon(aes(ymin = Q2.5, ymax = Q97.5), fill = "black", alpha = 1.5) +
  ggtitle("95% Bootstrapped Confidence Bands")

Bootstrapping uses the original, initial sample as the population from which to resample, whereas Monte Carlo simulation is based on setting up a data generation process (parameters).

How to calculate the prediction interval for LMM

2023-05-12

How to calculate the prediction interval for LMM