Variance Reduction Using Importance Sampling

Importance sampling is a variance reduction technique that can improve efficiency relative to simple Monte Carlo integration and, unlike simple Monte Carlo, applies to integrals over unbounded intervals. The idea is that some values of the input random variable contribute more to the estimate of the parameter of interest than others; if we sample from a distribution that overweights these important values, we can reduce the variance of the estimator. Because the draws come from a non-uniform distribution with a given density, or importance function, the unweighted sample mean is a biased estimator; weighting each draw by the ratio of the target density to the importance density restores unbiasedness. A good importance function is easy to sample from, yields an estimator with finite variance, and is close in shape to the integrand: we try to choose \(\phi(x)\) such that \(\phi(x) \simeq |g(x)|f(x)\) on the set \(A\) where \(f(x)\) is supported, so that the ratio \(g(x)f(x)/\phi(x)\) is nearly constant.
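
As a minimal sketch (illustrative names, not from the text), the estimator weights each draw from the importance density \(\phi\) by \(f/\phi\):

# Generic importance sampling estimate of theta = E_f[g(X)] (a sketch; the
# function and argument names here are assumptions, not from the source).
is_estimate <- function(g, f, phi, x) {
  w <- f(x) / phi(x)     # importance weights
  mean(g(x) * w)         # weighted average; unbiased for theta
}

In the examples below, \(f\) is the \(U(0, 1)\) density, so the weight reduces to \(g(x)/\phi(x)\).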

Example 5.10

\(\theta = \int_{0}^{1} \frac{e^{-x}}{1 + x^2} dx\)

Estimation by simple Monte Carlo integration

m <- 10000
x <- runif(m)                             # draws from U(0, 1)
g <- function(x) {exp(-x) / (1 + x^2)}
(mci_est <- mean(g(x)))                   # simple Monte Carlo estimate of theta
## [1] 0.5293148
(mci_var <- var(g(x)))                    # variance of a single draw of g(X)
## [1] 0.06051204
(exact <- integrate(g, 0, 1))
## 0.5247971 with absolute error < 5.8e-15
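
The variance reported by var(g(x)) is the per-draw variance; the Monte Carlo estimator itself has variance \(\mathrm{Var}(g(X))/m\). A quick check (illustrative, not in the original):

# Standard error of the simple Monte Carlo estimate (sketch).
(mci_se <- sqrt(mci_var / m))   # ~ 0.0025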

Estimation and variance across importance functions

\(f_0(x) = 1, 0 < x < 1\)

\(f_1(x) = e^{-x}, 0 < x < \infty\)

\(f_2(x) = (1 + x^2)^{-1} / \pi, -\infty < x < \infty\)

\(f_3(x) = e^{-x} / (1 - e^{-1}), 0 < x < 1\)

\(f_4(x) = 4(1 + x^2)^{-1} / \pi, 0 < x < 1\)

estimate <- data.frame(importance_function = character(5), 
                       theta_hat = numeric(5), 
                       variance = numeric(5), 
                       stringsAsFactors = FALSE)

g <- function(x) {exp(-x) / (1 + x^2) * (x > 0) * (x < 1)}
f0 <- function(x) 1
f1 <- function(x) exp(-x)
f2 <- function(x) (1 / pi) / (1 + x^2)
f3 <- function(x) exp(-x) / (1 - exp(-1))
f4 <- function(x) 4 / ((1 + x^2) * pi)
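
As a quick sanity check (not in the original), each candidate should integrate to 1 over its stated support:

integrate(f1, 0, Inf)$value       # ~ 1
integrate(f2, -Inf, Inf)$value    # ~ 1
integrate(f3, 0, 1)$value         # ~ 1
integrate(f4, 0, 1)$value         # ~ 1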

#f0: uniform on (0, 1)
x <- runif(m)
fg <- g(x) / f0(x)
estimate$importance_function[1] <- "f0"
estimate$theta_hat[1] <- mean(fg)
estimate$variance[1] <- var(fg)

#f1: exponential with rate 1 on (0, Inf)
x <- rexp(m, 1)
fg <- g(x) / f1(x)
estimate$importance_function[2] <- "f1"
estimate$theta_hat[2] <- mean(fg)
estimate$variance[2] <- var(fg)

#f2: standard Cauchy; map draws outside (0, 1) to 2 so that g() assigns them zero weight
x <- rcauchy(m)
i <- c(which(x > 1), which(x < 0))
x[i] <- 2
fg <- g(x) / f2(x)
estimate$importance_function[3] <- "f2"
estimate$theta_hat[3] <- mean(fg)
estimate$variance[3] <- var(fg)

#f3
u <- runif(m)
x <- - log(1 - u * (1 - exp(-1)))
fg <- g(x) / f3(x)
estimate$importance_function[4] <- "f3"
estimate$theta_hat[4] <- mean(fg)
estimate$variance[4] <- var(fg)

#f4: inverse transform; F(x) = (4 / pi) * atan(x) on (0, 1)
u <- runif(m)
x <- tan(pi * u / 4)
fg <- g(x) / f4(x)
estimate$importance_function[5] <- "f4"
estimate$theta_hat[5] <- mean(fg)
estimate$variance[5] <- var(fg)

knitr::kable(estimate)
| importance_function | theta_hat | variance  |
|---------------------|-----------|-----------|
| f0                  | 0.5223684 | 0.0598330 |
| f1                  | 0.5270698 | 0.1757460 |
| f2                  | 0.5207145 | 0.9008786 |
| f3                  | 0.5228689 | 0.0095242 |
| f4                  | 0.5223731 | 0.0197161 |

f3 produces the smallest variance: of the five candidates it is closest in shape to g on (0, 1), so the ratio g(x)/f3(x) is nearly constant. f2 performs worst because most Cauchy draws fall outside (0, 1) and receive zero weight.

library(ggplot2)

ggplot(data.frame(x = seq(0.01, 0.99, 0.01)), aes(x)) +
  stat_function(fun = g, aes(linetype = "g", color = "g")) +
  stat_function(fun = f0, aes(linetype = "f0", color = "f0")) +
  stat_function(fun = f1, aes(linetype = "f1", color = "f1")) +
  stat_function(fun = f2, aes(linetype = "f2", color = "f2")) +
  stat_function(fun = f3, aes(linetype = "f3", color = "f3")) +
  stat_function(fun = f4, aes(linetype = "f4", color = "f4")) +
  scale_linetype_manual(name = "", 
                        values = c("g" = "solid",
                                   "f0" = "dashed",
                                   "f1" = "1F",
                                   "f2" = "dotted",
                                   "f3" = "dotdash",
                                   "f4" = "longdash"),
                        breaks = c("g", "f0", "f1", "f2", "f3", "f4")) +
  scale_color_manual(name = "", 
                     values = c("g" = "red",
                                "f0" = "black",
                                "f1" = "black",
                                "f2" = "black",
                                "f3" = "black",
                                "f4" = "black"),
                     breaks = c("g", "f0", "f1", "f2", "f3", "f4")) +
  ylab("f(x)")

ggplot(data.frame(x = seq(0.01, 0.99, 0.01)), aes(x)) +
  stat_function(fun = function(x) g(x) / f0(x), aes(linetype = "f0", color = "f0")) +
  stat_function(fun = function(x) g(x) / f1(x), aes(linetype = "f1", color = "f1")) +
  stat_function(fun = function(x) g(x) / f2(x), aes(linetype = "f2", color = "f2")) +
  stat_function(fun = function(x) g(x) / f3(x), aes(linetype = "f3", color = "f3")) +
  stat_function(fun = function(x) g(x) / f4(x), aes(linetype = "f4", color = "f4")) +
  scale_linetype_manual(name = "", 
                        values = c("f0" = "dashed",
                                   "f1" = "1F",
                                   "f2" = "dotted",
                                   "f3" = "dotdash",
                                   "f4" = "longdash"),
                        breaks = c("f0", "f1", "f2", "f3", "f4")) +
  scale_color_manual(name = "", 
                     values = c("f0" = "black",
                                "f1" = "black",
                                "f2" = "black",
                                "f3" = "black",
                                "f4" = "black"),
                     breaks = c("f0", "f1", "f2", "f3", "f4")) +
  ylab("g(x) / f(x)")

Example 5.3: the standard normal CDF

\(\Phi(x) = \int_{-\infty}^{x} \frac{1}{\sqrt{2 \pi}} e^{-t^2 / 2} dt\)

For \(x > 0\), split the integral at zero, \(\Phi(x) = \frac{1}{2} + \int_{0}^{x} \frac{1}{\sqrt{2 \pi}} e^{-t^2 / 2} dt\), and substitute \(y = t/x\), \(dt = x \, dy\) in the second term:

\(\theta = \int_{0}^{1} xe^{-(xy)^2 / 2} dy\)

\(\theta = E_Y[xe^{-(xY)^2/2}]\), for \(Y \sim U(0, 1)\)

so that \(\Phi(x) = \frac{1}{2} + \theta / \sqrt{2 \pi}\), which is the form computed below.

x <- seq(0.1, 2.5, length = 10)
m <- 10000
u <- runif(m)
cdf <- numeric(length(x))

for (i in 1:length(x)) {
  g <- x[i] * exp(-(u * x[i])^2 / 2)        # integrand evaluated at the U(0, 1) draws
  cdf[i] <- mean(g) / sqrt(2 * pi) + 0.5    # Phi(x) = 1/2 + theta / sqrt(2 pi)
}
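
A vectorized equivalent of the loop (illustrative; it also avoids masking the function g defined earlier):

cdf_v <- vapply(x, function(xi) {
  mean(xi * exp(-(u * xi)^2 / 2)) / sqrt(2 * pi) + 0.5
}, numeric(1))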

knitr::kable(cbind(x, cdf, Phi = pnorm(x)))
| x         | cdf       | Phi       |
|-----------|-----------|-----------|
| 0.1000000 | 0.5398280 | 0.5398278 |
| 0.3666667 | 0.6430736 | 0.6430662 |
| 0.6333333 | 0.7367804 | 0.7367420 |
| 0.9000000 | 0.8160509 | 0.8159399 |
| 1.1666667 | 0.8785699 | 0.8783275 |
| 1.4333333 | 0.9245655 | 0.9241187 |
| 1.7000000 | 0.9561676 | 0.9554345 |
| 1.9666667 | 0.9764927 | 0.9753892 |
| 2.2333333 | 0.9887901 | 0.9872365 |
| 2.5000000 | 0.9958645 | 0.9937903 |

The Monte Carlo estimates agree with pnorm to about four decimal places for small x, with the error growing as x increases.

\(f_0(x) = 1/1.2, \; 0 < x < 1.2\)

\(f_1(x) = - 0.08 x + 0.4, 0 \leq x < 5\)

\(f_2(x) = (1 + x^2)^{-1} / \pi, -\infty < x < \infty\)

estimate <- data.frame(importance_function = character(3), 
                       theta_hat = numeric(3), 
                       variance = numeric(3), 
                       stringsAsFactors = FALSE)

g <- function(x) {
  (exp(-x^2 / 2) / sqrt(2 * pi)) * (x > 0) * (x < 1.2)
}

pnorm(1.2) - 0.5
## [1] 0.3849303
integrate(g, 0, 1.2)
## 0.3849303 with absolute error < 4.3e-15
#f0: uniform on (0, 1.2)
f0 <- function(x) 1
x <- runif(m, 0, 1.2)
fg <- 1.2 * g(x) / f0(x)   # the factor 1.2 supplies the U(0, 1.2) density 1/1.2
estimate$importance_function[1] <- "f0"
estimate$theta_hat[1] <- mean(fg)
estimate$variance[1] <- var(fg)

#f1: use rejection sampling to get the x_i's
f1 <- function(x) (- 0.08 * x + 0.4)
u <- runif(m * 10, 0, 5)    # proposals from U(0, 5), the support of f1
y <- runif(m * 10, 0, 1)
p <- f1(u)                  # f1(u) <= 0.4 on [0, 5), so y < p accepts with probability f1(u)

x <- vector("numeric", m)
j <- 1
for (i in 1:length(u)) {
  if (y[i] < p[i]) {        # accept proposal u[i]
    x[j] <- u[i]
    j <- j + 1
  }
  if (j == m + 1) {         # stop once m draws have been accepted
    break
  }
}

i <- c(which(x > 1.2), which(x < 0))   # map draws outside (0, 1.2) to 2 so g() zeroes them
x[i] <- 2
fg <- g(x) / f1(x)
estimate$importance_function[2] <- "f1"
estimate$theta_hat[2] <- mean(fg)
estimate$variance[2] <- var(fg)
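
As an aside (a sketch, not in the original), f1 also admits exact inverse-transform sampling, since its CDF \(F(x) = 0.4x - 0.04x^2\) inverts in closed form, making the rejection step unnecessary:

# Solving F(x) = 0.4x - 0.04x^2 = u on [0, 5] gives x = 5 - 5 * sqrt(1 - u).
u <- runif(m)
x_inv <- 5 - 5 * sqrt(1 - u)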

#f2: reuse the standard Cauchy importance function from Example 5.10
x <- rcauchy(m)
i <- c(which(x > 1.2), which(x < 0))
x[i] <- 2
fg <- g(x) / f2(x)
estimate$importance_function[3] <- "f2"
estimate$theta_hat[3] <- mean(fg)
estimate$variance[3] <- var(fg)
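
As with f4 in Example 5.10, truncating the Cauchy density to the integration range avoids the wasted draws; a sketch under that assumption (f2t is not in the original):

# Hypothetical variant: Cauchy density truncated to (0, 1.2).
f2t <- function(x) (1 / atan(1.2)) / (1 + x^2)
u <- runif(m)
x_t <- tan(u * atan(1.2))     # inverse CDF: F(x) = atan(x) / atan(1.2)
fg_t <- g(x_t) / f2t(x_t)
mean(fg_t)                    # close to the true value 0.3849
var(fg_t)                     # much smaller than under the untruncated f2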

knitr::kable(estimate)
| importance_function | theta_hat | variance  |
|---------------------|-----------|-----------|
| f0                  | 0.3856881 | 0.0058824 |
| f1                  | 0.3922011 | 0.2087144 |
| f2                  | 0.3828775 | 0.3843178 |

Here the uniform f0 performs best: g is relatively flat on (0, 1.2), while f1 and f2 place most of their mass outside the integration range, so the bulk of their draws receive zero weight and the variance is inflated.

ggplot(data.frame(x = seq(0.01, 1.19, 0.01)), aes(x)) +
  coord_cartesian(ylim = c(0, 1)) +
  stat_function(fun = g, aes(linetype = "g", color = "g")) +
  stat_function(fun = f0, aes(linetype = "f0", color = "f0")) +
  stat_function(fun = f1, aes(linetype = "f1", color = "f1")) +
  stat_function(fun = f2, aes(linetype = "f2", color = "f2")) +
  scale_linetype_manual(name = "", 
                        values = c("g" = "solid",
                                   "f0" = "dashed",
                                   "f1" = "dotted",
                                   "f2" = "dotdash"),
                        breaks = c("g", "f0", "f1", "f2")) +
  scale_color_manual(name = "", 
                     values = c("g" = "red",
                                "f0" = "black",
                                "f1" = "black",
                                "f2" = "black"),
                     breaks = c("g", "f0", "f1", "f2")) +
  ylab("f(x)")

ggplot(data.frame(x = seq(0.01, 1.19, 0.01)), aes(x)) +
  # coord_cartesian(ylim = c(0, 1)) +
  stat_function(fun = function(x) g(x) / f0(x), aes(linetype = "f0", color = "f0")) +
  stat_function(fun = function(x) g(x) / f1(x), aes(linetype = "f1", color = "f1")) +
  stat_function(fun = function(x) g(x) / f2(x), aes(linetype = "f2", color = "f2")) +
  scale_linetype_manual(name = "", 
                        values = c("f0" = "dashed",
                                   "f1" = "dotted",
                                   "f2" = "dotdash"),
                        breaks = c("f0", "f1", "f2")) +
  scale_color_manual(name = "", 
                     values = c("f0" = "black",
                                "f1" = "black",
                                "f2" = "black"),
                     breaks = c("f0", "f1", "f2")) +
  ylab("g(x) / f(x)")

References

Rizzo, Maria L. 2008. Statistical Computing with R. Chapter 5: Monte Carlo Integration and Variance Reduction.