chapter3

#第三章随机变量生成方式 ## 3.1 引言 ###例3.1（有限总体抽样）

#有放回
sample(0:1,size = 10, replace=TRUE)

##  [1] 1 1 0 1 1 0 1 0 0 0

#无放回
sample(1:100,size= 6, replace=FALSE)

## [1] 40 77 53 73  8 17

#无放回随机排列
sample(letters)

##  [1] "b" "z" "v" "p" "h" "l" "t" "o" "m" "y" "c" "n" "d" "a" "r" "u" "x" "q" "w"
## [20] "f" "e" "j" "s" "i" "g" "k"

#有放回地按概率分布抽取,并生成频数表
x<-sample(1:3,size=100,replace=TRUE,prob=c(.2,.3,.5)) 
table(x)

## x
##  1  2  3 
## 22 36 42

##3.2 逆变换法 ###例3.2连续情形下的逆变换法

n<-1000
u<-runif(n)
x<-u^(1/3)
hist(x,probability = TRUE,main=bquote(f(x)==3*x^2))
y<-seq(0,1,.01)
lines(y,3*y^2)

hist(x,prob=TRUE,main=expression(f(x)==3*x^2))

###例3.4两点分布

n<-1000
p<-0.4
u<-runif(n)
x<-as.integer(u>0.6)
mean(x)

## [1] 0.412

var(x)

## [1] 0.2424985

###例3.5几何分布

n<-1000
p<-0.25
u<-runif(n)
k<-ceiling(log(1-u)/log(1-p))-1
table(k)

## k
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
## 263 181 141 102  80  54  53  36  23  19  14   6   9   6   3   1   2   3   1   1 
##  23  24 
##   1   1

###例3.6对数分布

rlogarithmic <- function(n, theta) {
#returns a random logarithmic(theta) sample size n
u <- runif(n)
#set the initial length of cdf vector
N <- ceiling(-16 / log10(theta))
k <- 1:N
a <- -1/log(1-theta)
fk <- exp(log(a) + k * log(theta) - log(k))
Fk <- cumsum(fk)
x <- integer(n)
for (i in 1:n) {
x[i] <- as.integer(sum(u[i] > Fk)) #F^{-1}(u)-1
while (x[i] == N) {
#if x==N we need to extend the cdf
#very unlikely because N is large
logf <- log(a) + (N+1)*log(theta) - log(N+1)
fk <- c(fk, exp(logf))
Fk <- c(Fk, Fk[N] + fk[N+1])
N <- N + 1
x[i] <- as.integer(sum(u[i] > Fk))}}
x + 1}
n <- 1000
theta <- 0.5
x <- rlogarithmic(n, theta)
#compute density of logarithmic(theta) for comparison
k <- sort(unique(x))
p <- -1 / log(1 - theta) * theta^k / k
se <- sqrt(p*(1-p)/n) #standard error
round(rbind(table(x)/n, p, se),3)

##        1     2     3     4     5     6     7
##    0.700 0.190 0.078 0.020 0.006 0.003 0.003
## p  0.721 0.180 0.060 0.023 0.009 0.004 0.002
## se 0.014 0.012 0.008 0.005 0.003 0.002 0.001

##3.3拒绝接受法 ###例3.7接受拒绝法

n <- 1000
k <- 0 #counter for accepted
j <- 0 #iterations
y <- numeric(n)
while (k < n) {
u <- runif(1)
j <- j + 1
x <- runif(1) #random variate from g
if (x * (1-x) > u) {
#we accept x
k <- k + 1
y[k] <- x}}
j

## [1] 5761

#compare empirical and theoretical percentiles
p <- seq(.1, .9, .1)
Qhat <- quantile(y, p) #quantiles of sample
Q <- qbeta(p, 2, 2) #theoretical quantiles
se <- sqrt(p * (1-p) / (n * dbeta(Q, 2, 2)^2))
round(rbind(Qhat, Q, se), 3)

##        10%   20%   30%   40%   50%   60%   70%   80%   90%
## Qhat 0.189 0.300 0.375 0.438 0.510 0.583 0.646 0.724 0.811
## Q    0.196 0.287 0.363 0.433 0.500 0.567 0.637 0.713 0.804
## se   0.010 0.010 0.010 0.011 0.011 0.011 0.010 0.010 0.010

##3.4其他变化法 ###例3.8贝塔分布

n <- 1000
a <- 3
b <- 2
u <- rgamma(n, shape=a, rate=1)
v <- rgamma(n, shape=b, rate=1)
x <- u / (u + v)
q <- qbeta(ppoints(n), a, b)
qqplot(q, x, cex=0.25, xlab="Beta(3, 2)", ylab="Sample")
abline(0, 1)

###例3.9对数分布

n <- 1000
theta <- 0.5
u <- runif(n) #generate logarithmic sample
v <- runif(n)
x <- floor(1 + log(v) / log(1 - (1 - theta)^u))
k <- 1:max(x) #calc. logarithmic probs.
p <- -1 / log(1 - theta) * theta^k / k
se <- sqrt(p*(1-p)/n)
p.hat <- tabulate(x)/n
print(round(rbind(p.hat, p, se), 3))

##        [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9]
## p.hat 0.726 0.176 0.061 0.023 0.007 0.004 0.001 0.001 0.001
## p     0.721 0.180 0.060 0.023 0.009 0.004 0.002 0.001 0.000
## se    0.014 0.012 0.008 0.005 0.003 0.002 0.001 0.001 0.001

rlogarithmic <- function(n, theta) {
stopifnot(all(theta > 0 & theta < 1))
th <- rep(theta, length=n)
u <- runif(n)
v <- runif(n)
x <- floor(1 + log(v) / log(1 - (1 - th)^u))
return(x)}

##3.5求和变换与混合 ###例3.10卡方分布

n <- 1000
nu <- 2
X <- matrix(rnorm(n*nu), n, nu)^2 #matrix of sq. normals
#sum the squared normals across each row: method 1
y <- rowSums(X)
#method 2
y <- apply(X, MARGIN=1, FUN=sum) #a vector length n
mean(y)

## [1] 2.057338

mean(y^2)

## [1] 8.609749

###例3.11卷积和混合

n <- 1000
x1 <- rgamma(n, 2, 2)
x2 <- rgamma(n, 2, 4)
s <- x1 + x2 #the convolution
u <- runif(n)
k <- as.integer(u > 0.5) #vector of 0’s and 1’s
x <- k * x1 + (1-k) * x2 #the mixture
par(mfcol=c(1,2)) #two graphs per page
hist(s, prob=TRUE, xlim=c(0,5), ylim=c(0,1))
hist(x, prob=TRUE, xlim=c(0,5), ylim=c(0,1))

par(mfcol=c(1,1)) #restore display

###例3.12多个伽马分布的混合变量

n <- 5000
k <- sample(1:5, size=n, replace=TRUE, prob=(1:5)/15)
rate <- 1/k
x <- rgamma(n, shape=3, rate=rate)
#plot the density of the mixture
#with the densities of the components
plot(density(x), xlim=c(0,40), ylim=c(0,.3),
lwd=3, xlab="x", main="")
for (i in 1:5)
lines(density(rgamma(n, 3, 1/i)))

###例3.13/3.14绘制混合变量的密度图

n<-5000
p<-c(.1,.2,.2,.3,.2)
lambda<-c(1,1.5,2,2.5,3)
k<-sample(1:5,size=n,replace=TRUE,prob=p)
rate<-lambda[k]
x<-rgamma(n,shape=3,rate=rate)
k[1:8]

## [1] 5 4 2 5 4 5 5 3

rate[1:8]

## [1] 3.0 2.5 1.5 3.0 2.5 3.0 3.0 2.0

f<-function(x,lambda,theta)
{sum(dgamma(x,3,lambda)*theta)}
x<-seq(0,8,length=200)
dim(x)<-length(x)
y<-apply(x,1,f,lambda=lambda,theta=p)
plot(x,y,type="l",ylim=c(0,.85),lwd=3,ylab="density")
for(j in 1:5){y<-apply(x,1,dgamma,shape=3,rate=lambda[j])
lines(x,y)}

###例3.15泊松-伽马混合分布

n<-1000
r<-4
beta<-3
lambda<-rgamma(n,r,beta)
x<-rpois(n,lambda)
mix<-tabulate(x+1)/n
negbin<-round(dnbinom(0:max(x),r,beta/(1+beta)),3)
se<-sqrt(negbin*(1-negbin)/n)
round(rbind(mix,negbin,se),3)

##         [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]
## mix    0.289 0.368 0.182 0.090 0.042 0.014 0.011 0.004
## negbin 0.316 0.316 0.198 0.099 0.043 0.017 0.006 0.002
## se     0.015 0.015 0.013 0.009 0.006 0.004 0.002 0.001

# 创建数据框
df <- data.frame(
  x = 0:max(x),
  mix = mix,
  negbin = negbin,
  lower = negbin - 1.96 * se,
  upper = negbin + 1.96 * se
)
# 绘制比较图
library(ggplot2)

## Warning: 程序包'ggplot2'是用R版本4.4.2 来建造的

ggplot(df, aes(x = x)) +
  geom_point(aes(y = mix, color = "Observed Mix")) +
  geom_line(aes(y = mix, color = "Observed Mix"), linetype = "dashed", alpha = 0.5) +
  geom_point(aes(y = negbin, color = "Negative Binomial")) +
  geom_line(aes(y = negbin, color = "Negative Binomial"), linetype = "solid", alpha = 0.5) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, color = "darkgray") +
  scale_color_manual(values = c("Observed Mix" = "red", "Negative Binomial" = "blue")) +
  labs(
    title = "Comparison of Observed Mix and Negative Binomial Distribution",
    x = "Count",
    y = "Probability",
    color = "Distribution"
  ) +
  theme_minimal() +
  coord_cartesian(xlim = c(0, max(df$x[df$mix > 0 | df$negbin > 0.001])))  # 调整x轴范围以聚焦有数据的区域

##3.6多元分布 ###例3.16谱分解法

mu<-c(0,0)
sigma<-matrix(c(1,.9,.9,1) , nrow=2,ncol=2)
rmvn.eigen<-function(n,mu,sigma){d<-length(mu)
ev <- eigen(sigma,symmetric=TRUE)
lambda <- ev$values
V<-ev$vectors
R<-V%*%diag(sqrt(lambda))%*%t(V)
Z<-matrix(rnorm(n*d),nrow=n,ncol=d)
X<-Z%*%R+matrix(mu,n,d,byrow=TRUE)}
X<-rmvn.eigen(10000,mu,sigma)
plot(X,xlab="x",ylab="y",pch=20)

###例3.17SVD法

rmvn.svd <-
function(n, mu, Sigma) {
# generate n random vectors from MVN(mu, Sigma)
# dimension is inferred from mu and Sigma
d <- length(mu)
S <- svd(Sigma)
R <- S$u %*% diag(sqrt(S$d)) %*% t(S$v) #sq. root Sigma
Z <- matrix(rnorm(n*d), nrow=n, ncol=d)
X <- Z %*% R + matrix(mu, n, d, byrow=TRUE)
X
}

###例3.18choleski分解法

rmvn.Choleski <-
function(n, mu, Sigma) {
# generate n random vectors from MVN(mu, Sigma)
# dimension is inferred from mu and Sigma
d <- length(mu)
Q <- chol(Sigma) # Choleski factorization of Sigma
Z <- matrix(rnorm(n*d), nrow=n, ncol=d)
X <- Z %*% Q + matrix(mu, n, d, byrow=TRUE)
X}
y <- subset(x=iris, Species=="virginica")[, 1:4]
mu <- colMeans(y)
Sigma <- cov(y)
mu

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##        6.588        2.974        5.552        2.026

Sigma

##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length   0.40434286  0.09376327   0.30328980  0.04909388
## Sepal.Width    0.09376327  0.10400408   0.07137959  0.04762857
## Petal.Length   0.30328980  0.07137959   0.30458776  0.04882449
## Petal.Width    0.04909388  0.04762857   0.04882449  0.07543265

X <- rmvn.Choleski(200, mu, Sigma)
pairs(X)

###例3.19MVN生成程序性能比较

library(MASS)
library(mvtnorm)

## Warning: 程序包'mvtnorm'是用R版本4.4.2 来建造的

n <- 100 #sample size
d <- 30 #dimension
N <- 2000 #iterations
mu <- numeric(d)

set.seed(100)
system.time(for (i in 1:N)
rmvn.eigen(n, mu, cov(matrix(rnorm(n*d), n, d))))

## 用户 系统 流逝 
## 0.56 0.05 1.27

set.seed(100)
system.time(for (i in 1:N)
rmvn.svd(n, mu, cov(matrix(rnorm(n*d), n, d))))

## 用户 系统 流逝 
## 0.69 0.01 1.39

set.seed(100)
system.time(for (i in 1:N)
rmvn.Choleski(n, mu, cov(matrix(rnorm(n*d), n, d))))

## 用户 系统 流逝 
## 0.58 0.05 0.92

set.seed(100)
system.time(for (i in 1:N)
mvrnorm(n, mu, cov(matrix(rnorm(n*d), n, d))))

## 用户 系统 流逝 
## 0.67 0.12 1.27

set.seed(100)
system.time(for (i in 1:N)
rmvnorm(n, mu, cov(matrix(rnorm(n*d), n, d))))

## 用户 系统 流逝 
## 0.66 0.14 1.69

set.seed(100)
system.time(for (i in 1:N)
cov(matrix(rnorm(n*d), n, d)))

## 用户 系统 流逝 
## 0.19 0.02 0.44

###例3.20多元正态的混合分布

library(MASS) #for mvrnorm
#inefficient version loc.mix.0 with loops
loc.mix.0 <- function(n, p, mu1, mu2, Sigma) {
#generate sample from BVN location mixture
X <- matrix(0, n, 2)
for (i in 1:n) {
k <- rbinom(1, size = 1, prob = p)
if (k)
X[i,] <- mvrnorm(1, mu = mu1, Sigma) else
X[i,] <- mvrnorm(1, mu = mu2, Sigma)}
return(X)}
#more efficient version
loc.mix <- function(n, p, mu1, mu2, Sigma) {
#generate sample from BVN location mixture
n1 <- rbinom(1, size = n, prob = p)
n2 <- n - n1
x1 <- mvrnorm(n1, mu = mu1, Sigma)
x2 <- mvrnorm(n2, mu = mu2, Sigma)
X <- rbind(x1, x2) #combine the samples
return(X[sample(1:n), ]) #mix them
}
x <- loc.mix(1000, .5, rep(0, 4), 2:5, Sigma = diag(4))
r <- range(x) * 1.2
par(mfrow = c(2, 2))
for (i in 1:4)
hist(x[ , i], xlim = r, ylim = c(0, .3), freq = FALSE,
main = "", breaks = seq(-5, 10, .5))

par(mfrow = c(1, 1))

###例3.21生成球面上的随机变量

runif.sphere <- function(n, d) {
# return a random sample uniformly distributed
# on the unit sphere in R ^d
M <- matrix(rnorm(n*d), nrow = n, ncol = d)
L <- apply(M, MARGIN = 1,
FUN = function(x){sqrt(sum(x*x))})
D <- diag(1 / L)
U <- D %*% M
U}
#generate a sample in d=2 and plot
X <- runif.sphere(200, 2)
par(pty = "s")
plot(X, xlab = bquote(x[1]), ylab = bquote(x[2]))

par(pty = "m")

#练3.2

#定义标准拉普拉斯分布的逆累积分布函数（CDF）
inverse_cdf_laplace <- function(u) {
  ifelse(u < 0.5, log(2 * u), -log(2 * (1 - u)))}
#设置样本数量
n_samples <- 1000
#生成0到1之间的均匀随机数
uniform_samples <- runif(n_samples, min = 0, max = 1)
#使用逆CDF将均匀样本转换为拉普拉斯样本
laplace_samples <- sapply(uniform_samples, inverse_cdf_laplace)
#绘制直方图
hist(laplace_samples, breaks = 50, probability = TRUE, col = "lightgreen", border = "black",
     main = "生成样本直方图与真实拉普拉斯PDF的比较",
     xlab = "x", ylab = "密度")
#添加真实拉普拉斯PDF曲线
curve(0.5 * exp(-abs(x)), add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("生成样本", "真实拉普拉斯PDF"), col = c("lightgreen", "red"), lty = 1)

#练3.3

#定义Pareto(a, b)分布的逆累积分布函数（CDF）
inverse_cdf_pareto <- function(u, a, b) {
  b / (1 - u)^(1/a)}
a <- 2
b <- 2
n_samples <- 1000
uniform_samples <- runif(n_samples, min = 0, max = 1)
#使用逆CDF将均匀样本转换为Pareto样本
pareto_samples <- sapply(uniform_samples, inverse_cdf_pareto, a = a, b = b)
#绘制密度直方图
hist(pareto_samples, breaks = 50, probability = TRUE, col = "lightblue", border = "black",
     main = "生成样本密度直方图与真实Pareto(2, 2)密度的比较",
     xlab = "x", ylab = "密度")
#添加真实Pareto(2, 2)密度曲线以进行比较
curve((a * b^a) / x^(a + 1), from = b, to = max(pareto_samples), add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("生成样本", "真实Pareto(2, 2)密度"), col = c("lightblue", "red"), lty = 1)

#练3.7

#定义Beta(a, b)分布的概率密度函数（PDF）
beta_pdf <- function(x, a, b) {
  dbeta(x, shape1 = a, shape2 = b)}
#定义一个辅助分布（这里选择均匀分布U(0, 1)）
proposal_pdf <- function(x) {
  dunif(x, min = 0, max = 1)}
#计算辅助分布与目标分布的最大比值C
find_max_c <- function(a, b) {
  # 在区间[0, 1]内找到beta_pdf / proposal_pdf的最大值
  x_values <- seq(0, 1, length.out = 1000)
  pdf_ratios <- beta_pdf(x_values, a, b) / proposal_pdf(x_values)
  return(max(pdf_ratios))}
#定义接受-拒绝方法来生成Beta(a, b)分布的样本
generate_beta_sample <- function(n, a, b) {
  c <- find_max_c(a, b)
  samples <- numeric(n)
  i <- 1
  while (i <= n) {
    y <- runif(1, min = 0, max = 1)          # 从均匀分布中抽取y
    u <- runif(1, min = 0, max = 1)          # 从均匀分布中抽取u
    if (u <= beta_pdf(y, a, b) / (c * proposal_pdf(y))) {
      samples[i] <- y
      i <- i + 1}} 
  return(samples)}
a <- 3
b <- 2
n_samples <- 1000
beta_samples <- generate_beta_sample(n_samples, a, b)
#绘制直方图
hist(beta_samples, breaks = 50, probability = TRUE, col = "lightblue", border = "black",
     main = "生成样本直方图与理论Beta(3, 2)密度的比较",
     xlab = "x", ylab = "密度")
#添加理论Beta(3, 2)密度曲线
curve(dbeta(x, shape1 = a, shape2 = b), from = 0, to = 1, add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("生成样本", "理论Beta(3, 2)密度"), col = c("lightblue", "red"), lty = 1)

#练3.8

#定义生成Lognormal分布随机变量的函数
generate_lognormal_sample <- function(n, mu, sigma) {
  #从标准正态分布中生成n个随机样本
  z_samples <- rnorm(n, mean = 0, sd = 1)
  #使用变换方法将标准正态样本转换为Lognormal样本
  lognormal_samples <- exp(mu + sigma * z_samples)
  return(lognormal_samples)}
mu <- 0
sigma <- 1
n_samples <- 1000
#生成Lognormal(µ, σ)分布的样本
lognormal_samples <- generate_lognormal_sample(n_samples, mu, sigma)
hist(lognormal_samples, breaks = 50, probability = TRUE, col = "lightblue", border = "black",
     main = "生成样本直方图与理论Lognormal(0, 1)密度的比较",
     xlab = "x", ylab = "密度")
#添加理论Lognormal(µ, σ)密度曲线
curve(dlnorm(x, meanlog = mu, sdlog = sigma), add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("生成样本", "理论Lognormal(0, 1)密度"), col = c("lightblue", "red"), lty = 1)

#练3.11

#定义生成Lognormal分布随机变量的函数
generate_lognormal_sample <- function(n, mu, sigma) {
  #从标准正态分布中生成n个随机样本
  z_samples <- rnorm(n, mean = 0, sd = 1)
  #使用变换方法将标准正态样本转换为Lognormal样本
  lognormal_samples <- exp(mu + sigma * z_samples)
  return(lognormal_samples)}
mu <- 0
sigma <- 1
n_samples <- 1000
#生成Lognormal(µ, σ)分布的样本
lognormal_samples <- generate_lognormal_sample(n_samples, mu, sigma)
#绘制直方图
hist(lognormal_samples, breaks = 50, probability = TRUE, col = "lightblue", border = "black",
     main = "生成样本直方图与理论Lognormal(0, 1)密度的比较",
     xlab = "x", ylab = "密度")
# 添加理论Lognormal(µ, σ)密度曲线
curve(dlnorm(x, meanlog = mu, sdlog = sigma), add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("生成样本", "理论Lognormal(0, 1)密度"), col = c("lightblue", "red"), lty = 1)

#练3.13

#定义Pareto分布的概率密度函数（PDF）
pareto_pdf <- function(y, r, beta) {
  (r * beta^r) / y^(r + 1)}
#定义Pareto分布的累积分布函数（CDF）
pareto_cdf <- function(y, r, beta) {
  1 - (beta / (beta + y))^r}
#定义生成Pareto(r, beta)分布随机变量的函数
generate_pareto_sample <- function(n, r, beta) {
  #从均匀分布中生成n个随机样本
  u_samples <- runif(n, min = 0, max = 1)
  #使用逆变换方法将均匀样本转换为Pareto样本
  pareto_samples <- beta / ((1 - u_samples)^(1/r)) - beta
  return(pareto_samples)}
r <- 4
beta <- 2
n_samples <- 1000
#生成Pareto(r, beta)分布的样本
pareto_samples <- generate_pareto_sample(n_samples, r, beta)
#绘制直方图
hist(pareto_samples, breaks = 50, probability = TRUE, col = "lightblue", border = "black",
     main = "生成样本直方图与理论Pareto(4, 2)密度的比较",
     xlab = "y", ylab = "密度")
#添加理论Pareto(r, beta)密度曲线以进行比较
curve(pareto_pdf(x, r, beta), from = 0, to = max(pareto_samples), add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("生成样本", "理论Pareto(4, 2)密度"), col = c("lightblue", "red"), lty = 1)

chapter3

Jun