require(latticeExtra)
require(mosaic)
require(Hmisc)
library(kableExtra)
library(knitr)
Example 1:
Simulating Sampling Distribution of Mean for three different sample sizes, n=5, n=25 and n=100 from exponential distribution with mean 5 and visualizing it using boxplots.
set.seed(1234)
lambda<-2
numsim<-100
mean5<-rep(0,numsim)
mean25<-mean5
mean100<-mean5
for (i in 1:numsim){
mean5[i]<-mean(rexp(5,lambda))
mean25[i]<-mean(rexp(25,lambda))
mean100[i]<-mean(rexp(100,lambda))
}
boxplot(mean5,mean25,mean100,names=c("n=5","n=25","n=100"))
title("Distribution of Mean of Exponentials")

par(mfcol=c(1,3))
hist(mean5)
lines(density(mean5))
hist(mean25)
lines(density(mean25))
hist(mean100)
lines(density(mean100))

par(mfcol=c(1,1))
a <- c(mean(mean5),mean(mean25),mean(mean100), var(mean5), var(mean25), var(mean100))
tab <- matrix(rep(a, times=1), ncol=3, byrow=TRUE)
colnames(tab) <- c('Mean5', 'Mean25', 'Mean100')
rownames(tab) <- c('Mean', 'Variance')
tab <- as.table(round(tab, digits = 3))
tab %>%
kbl(caption = "Mean of Variance for 3 Sample Sizes") %>%
kable_paper("hover", full_width = F)
Mean of Variance for 3 Sample Sizes
|
|
Mean5
|
Mean25
|
Mean100
|
|
Mean
|
0.492
|
0.493
|
0.496
|
|
Variance
|
0.043
|
0.008
|
0.002
|
Notice how the variance of the estimator is getting smaller with increase in sample size. By CLT, the mean should have normal distribution or at least approximate normal, for large sample sizes, regardless of the orignating distribution from which samples came.
Example 2
Compare three estimators of the mean (μ) of a standard normal distribution. Comparing sample mean, trimmed mean and medians estimators of population mean. If the distribution of data is symmetric (as Standard Normal distribution of our example) all three estimators indeed estimate population mean but if the population from which sample is coming from is skewed then they do not.
set.seed(1234567)
s <- 3 #number of rows
n <- 5 #number of columns
mymatrix <- matrix(rnorm(s*n), nrow = 3, ncol = 5)
#need to add three columns for sample mean, trimmed mean, median
Sample.Mean <- numeric(s)
Trimmed.Mean <- numeric(s)
Median <- numeric(s)
#Notice there are three estimators (mean, trimmed mean and median) whose properties we are comparing
mydata <- cbind(mymatrix, Sample.Mean, Trimmed.Mean, Median)
mydata[,n+1] <- apply(mydata[,1:n],1,mean)
#finding mean for each row/sample
trimmean <- function(Y)
{
mean(Y,0.2)
}
mydata[,n+2] <- apply(mydata[,1:n],1,trimmean)
#finding trim mean for each sample/row
mydata[,n+3] <- apply(mydata[,1:n],1,median)
#finding median for each sample/row
mydata_round <- round(mydata, digits = 4)
kbl(mydata_round, caption = "Comparing Three Estimators Mu") %>%
kable_classic() %>%
add_header_above(c("N(0,1)" = 5, " " = 3))
Comparing Three Estimators Mu
|
N(0,1)
|
|
|
|
|
|
|
|
Sample.Mean
|
Trimmed.Mean
|
Median
|
|
0.1567
|
-1.3508
|
-1.7781
|
-0.1577
|
-0.3782
|
-0.7016
|
-0.6289
|
-0.3782
|
|
1.3738
|
-0.0085
|
0.9095
|
1.1020
|
-0.4986
|
0.5756
|
0.6677
|
0.9095
|
|
0.7307
|
0.3210
|
-0.9194
|
1.5416
|
-0.5413
|
0.2265
|
0.1701
|
0.3210
|
Example 2 Using functions
Simulation to compare sampling properties of three different estimators for the mean of a distribution based on an iid sample of size n * sample mean * 20% trimmed mean * median. Note we can generate samples from gamma, normal, and t to see the difference in estimators.
###Normal Distribution
set.seed(1234)
s <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)
generate.normal <- function(s, n, mu, sigma){
dat <- matrix(rnorm(n*s, mu, sigma), ncol=n, byrow=T)
out <- list(dat=dat)
return(out)
}
#function to generate S data sets of size n from normal distribution with mean mu and variance sigma^2
trimmean <- function(Y){mean(Y, 0.2)}
#function to compute the 20% trimmed mean
out <- generate.normal(s, n, mu, sigma)
outsamplemean <- apply(out$dat, 1, mean)
outtrimmean <- apply(out$dat, 1, trimmean)
outmedian <- apply(out$dat, 1, median)
summary.sim <- data.frame(mean = outsamplemean, trim = outtrimmean, median = outmedian)
#function to calculate summary statistics across the S data sets
simsum <- function(dat,trueval){
S <- nrow(dat)
MCmean <- apply(dat,2,mean)
MCbias <- MCmean-trueval
MCrelbias <- MCbias/trueval
MCstddev <- sqrt(apply(dat,2,var))
MCMSE <- apply((dat-trueval)^2,2,mean)
MCRE <- MCMSE[1]/MCMSE
sumdat <- rbind(rep(trueval, 3), S, MCmean, MCbias, MCrelbias, MCstddev, MCMSE, MCRE)
names <- c("true value", "# sims","MC mean", "MC bias", "MC relative bias", "MC standard deviation", "MC MSE", "MC relative efficiency")
ests <- c("Sample mean", "Trimmed mean", "Median")
dimnames(sumdat) <- list(names, ests)
round(sumdat, 5)
}
ntab <- round(simsum(summary.sim, mu), 3)
ntab %>%
kbl(caption = "Normal Distribution") %>%
kable_paper("hover", full_width = F)
Normal Distribution
|
|
Sample mean
|
Trimmed mean
|
Median
|
|
true value
|
1.000
|
1.000
|
1.000
|
|
# sims
|
1000.000
|
1000.000
|
1000.000
|
|
MC mean
|
1.007
|
1.008
|
1.022
|
|
MC bias
|
0.007
|
0.008
|
0.022
|
|
MC relative bias
|
0.007
|
0.008
|
0.022
|
|
MC standard deviation
|
0.330
|
0.350
|
0.398
|
|
MC MSE
|
0.109
|
0.123
|
0.159
|
|
MC relative efficiency
|
1.000
|
0.887
|
0.685
|
Gamma Distribution:
set.seed(1234)
s <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)
generate.gamma <- function(s, n, mu, sigma){
a <- 1/(sigma^2)
b <- mu/a
dat1 <- matrix(rgamma(n*s, shape = a, scale = b), ncol=n, byrow=T)
out1 <- list(dat1=dat1)
return(out1)
}
trimmean1 <- function(Y){mean(Y, 0.2)}
out1 <- generate.gamma(s, n, mu, sigma)
outsamplemean1 <- apply(out1$dat1, 1, mean)
outtrimmean1 <- apply(out1$dat1, 1, trimmean)
outmedian1 <- apply(out1$dat1, 1, median)
summary.sim1 <- data.frame(mean = outsamplemean1, trim = outtrimmean1, median = outmedian1)
#function to calculate summary statistics across the S data sets
simsum1 <- function(dat1,trueval){
S1 <- nrow(dat1)
MCmean1 <- apply(dat1,2,mean)
MCbias1 <- MCmean1-trueval
MCrelbias1 <- MCbias1/trueval
MCstddev1 <- sqrt(apply(dat1,2,var))
MCMSE1 <- apply((dat1-trueval)^2,2,mean)
MCRE1 <- MCMSE1[1]/MCMSE1
sumdat1 <- rbind(rep(trueval, 3), S1, MCmean1, MCbias1, MCrelbias1, MCstddev1, MCMSE1, MCRE1)
names1 <- c("true value", "# sims","MC mean", "MC bias", "MC relative bias", "MC standard deviation", "MC MSE", "MC relative efficiency")
ests1 <- c("Sample mean", "Trimmed mean", "Median")
dimnames(sumdat1) <- list(names1, ests1)
round(sumdat1, 5)
}
gtab <- round(simsum1(summary.sim1, mu), 3)
gtab %>%
kbl(caption = "Gamma Distribution") %>%
kable_paper("hover", full_width = F)
Gamma Distribution
|
|
Sample mean
|
Trimmed mean
|
Median
|
|
true value
|
1.000
|
1.000
|
1.000
|
|
# sims
|
1000.000
|
1000.000
|
1000.000
|
|
MC mean
|
1.007
|
0.691
|
0.587
|
|
MC bias
|
0.007
|
-0.309
|
-0.413
|
|
MC relative bias
|
0.007
|
-0.309
|
-0.413
|
|
MC standard deviation
|
0.335
|
0.276
|
0.303
|
|
MC MSE
|
0.112
|
0.172
|
0.262
|
|
MC relative efficiency
|
1.000
|
0.652
|
0.426
|
t - Distribution
set.seed(1234)
s <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)
generate.t <- function(s, n, mu, df){
dat2 <- matrix(mu +rt(n*s, df), ncol=n, byrow=T)
out2 <- list(dat2=dat2)
return(out2)
}
trimmean2 <- function(Y){mean(Y, 0.2)}
#function to compute the 20% trimmed mean
out2 <- generate.t(s, n, mu, 5)
outsamplemean2 <- apply(out2$dat2, 1, mean)
outtrimmean2 <- apply(out2$dat2, 1, trimmean)
outmedian2 <- apply(out2$dat2, 1, median)
summary.sim2 <- data.frame(mean = outsamplemean2, trim = outtrimmean2, median = outmedian2)
#function to calculate summary statistics across the S data sets
simsum2 <- function(dat2,trueval){
S2 <- nrow(dat2)
MCmean2 <- apply(dat2,2,mean)
MCbias2 <- MCmean2-trueval
MCrelbias2 <- MCbias2/trueval
MCstddev2 <- sqrt(apply(dat2,2,var))
MCMSE2 <- apply((dat2-trueval)^2,2,mean)
MCRE2 <- MCMSE2[1]/MCMSE2
sumdat2 <- rbind(rep(trueval, 3), S2, MCmean2, MCbias2, MCrelbias2, MCstddev2, MCMSE2, MCRE2)
names2 <- c("true value", "# sims","MC mean", "MC bias", "MC relative bias", "MC standard deviation", "MC MSE", "MC relative efficiency")
ests2 <- c("Sample mean", "Trimmed mean", "Median")
dimnames(sumdat2) <- list(names2, ests2)
round(sumdat2, 5)
}
ttab <- round(simsum2(summary.sim2, mu), 3)
ttab %>%
kbl(caption = "t - Distribution") %>%
kable_paper("hover", full_width = F)
t - Distribution
|
|
Sample mean
|
Trimmed mean
|
Median
|
|
true value
|
1.000
|
1.000
|
1.000
|
|
# sims
|
1000.000
|
1000.000
|
1000.000
|
|
MC mean
|
0.989
|
0.998
|
1.006
|
|
MC bias
|
-0.011
|
-0.002
|
0.006
|
|
MC relative bias
|
-0.011
|
-0.002
|
0.006
|
|
MC standard deviation
|
0.333
|
0.308
|
0.348
|
|
MC MSE
|
0.111
|
0.095
|
0.121
|
|
MC relative efficiency
|
1.000
|
1.165
|
0.915
|
Example 6.1 From textbook
This is an illustration of basic Monte Carlo estimation. We generate a sample of size 2 from normal distribution and target parameter is the absolute value of difference in means.
s <- 1000
g <- numeric(s)
for (i in 1:s) {
x <- rnorm(2,0,1)
g[i] <- abs(x[1] - x[2])
}
MCest <- mean(g)
MCsd <- sd(g)
theorvar <- 2 - (4/pi)
MCmse <- sum(g)/s
bias.g <- MCest-0 # Monte Carlo Bias, note mu ==0
tab1 <- matrix(c(MCest, MCsd, MCmse, theorvar, bias.g), ncol = 5, nrow = 1)
colnames(tab1) <- c("MC Estimate", "MC SD", "MC MSE", "Theor Var", "Bias")
rownames(tab1) <- ""
tab1 <- as.table(round(tab1, digits = 3))
tab1 %>%
kbl(caption = "Monte Carlo Estimation") %>%
kable_paper("hover", full_width = F)
Monte Carlo Estimation
|
|
MC Estimate
|
MC SD
|
MC MSE
|
Theor Var
|
Bias
|
|
|
1.165
|
0.861
|
1.165
|
0.727
|
1.165
|
Example 6.4 Confidence interval for variance:
n <- 20
alpha <- .05
x <- rnorm(n, mean=0, sd=2)
UCL <- (n-1) * var(x) / qchisq(alpha, df=n-1)
tucl <- matrix(c(UCL), ncol = 1, nrow = 1)
colnames(tucl) <- c("Upper Confidence Limit")
rownames(tucl) <- ""
tucl <- round(tucl, digits = 3)
tcul <- as.table(tucl)
tucl %>%
kbl(caption = "Upper Confidence Limit") %>%
kable_paper("hover", full_width = F)
Upper Confidence Limit
|
|
Upper Confidence Limit
|
|
|
3.658
|
Example 6.5 MC estimate of confidence level for variance.
n <- 20
alpha <- .05
UCL1 <- replicate(1000, expr = {
x <- rnorm(n, mean = 0, sd = 2)
(n-1) * var(x) / qchisq(alpha, df = n-1)
} )
#count the number of intervals that contain sigma^2=4
sucl <- sum(UCL1 > 4)/1000
#or we can compute the mean to get the confidence level
mucl <- mean(UCL1 > 4)
tabl <- matrix(c(mucl), ncol = 1, nrow = 1)
colnames(tabl) <- c("MC Estimate")
rownames(tabl) <- ""
tabl <- as.table(tabl)
tabl %>%
kbl(caption = "MC Estimate of Confidence Level for Variance") %>%
kable_paper("hover", full_width = F)
MC Estimate of Confidence Level for Variance
|
|
MC Estimate
|
|
|
0.949
|
set.seed(1234)
n1 <- 10
alpha <- .05
LCL2 <- replicate(10000, expr = {
x <- rnorm(n1, mean = 2, sd = 1)
mean(x) - qt((1 - alpha)/2, df = (n1 - 1)) * (sqrt((var(x))/n1))
} )
UCL2 <- replicate(10000, expr = {
x <- rnorm(n1, mean = 2, sd = 1)
mean(x) + qt((1 - alpha)/2, df = (n1 - 1)) * (sqrt((var(x))/n1))
} )
#or we can compute the mean to get the confidence level
muc2 <- mean(UCL2 > 2)
mlc2 <- mean(LCL2 < 2)
mci <- muc2 + mlc2
tab2 <- matrix(c(mci), ncol = 1, nrow = 1)
colnames(tab2) <- c("MC Estimate")
rownames(tab2) <- ""
tab2 <- as.table(tab2)
tab2 %>%
kbl(caption = "MC Estimate of Confidence Level for Variance") %>%
kable_paper("hover", full_width = F)
MC Estimate of Confidence Level for Variance
|
|
MC Estimate
|
|
|
0.951
|
Example 6.7:
n <- 20
alpha <- .05
mu0 <- 500
sigma <- 100
m <- 10000
p <- numeric(m)
for (j in 1:m) {
x <-rnorm(n, mu0, sigma)
ttest <- t.test(x, alternative = "greater", mu = mu0)
p[j] <- ttest$p.value
}
p.hat <- mean(p < alpha)
se.hat <- sqrt(p.hat * (1 - p.hat)/m)
tab3 <- matrix(c(round(p.hat, 3), round(se.hat, 3)), ncol = 2, nrow = 1)
colnames(tab3) <- c("Alpha Hat", "SE")
rownames(tab3) <- ""
tab3 <- as.table(tab3)
tab3 %>%
kbl(caption = "Empirical Type I Error") %>%
kable_paper("hover", full_width = F)
Empirical Type I Error
|
|
Alpha Hat
|
SE
|
|
|
0.047
|
0.002
|
set.seed(123456)
n <- 20
alpha <- .05
mu0 <- 500
sigma <- 100
m <- 1000
p2 <- numeric(m)
for (k in 1:m){
x2 <- rchisq(n, df = mu0)
ttest2 <- t.test(x2, alternative = "greater", mu = mu0)
p2[k] <- ttest2$p.value
}
p.hat2 <- mean(p2 < alpha)
se.hat2 <- sqrt(p.hat2 * (1-p.hat2) / m)
ctabl <- matrix(c(round(p.hat2, 3), round(se.hat2, 3)), ncol = 2, nrow = 1)
colnames(ctabl) <- c("Alpha Hat MC", "SE Hat")
rownames(ctabl) <- ""
ctabl <- as.table(ctabl)
ctabl %>%
kbl(caption = "Empirical Type I Error Chi-Square") %>%
kable_paper("hover", full_width = F)
Empirical Type I Error Chi-Square
|
|
Alpha Hat MC
|
SE Hat
|
|
|
0.037
|
0.006
|
Example 6.9:
n <- 20
h <- 1000
mu0 <- 500
sigma <- 100
mu <- c(seq(450, 650, 10))
M <- length(mu)
power <- numeric(M)
for (i in 1:M) {
mu1 <- mu[i]
pvalues <- replicate(h, expr = {
x3 <- rnorm(n, mean = mu1, sd = sigma)
ttest3 <- t.test(x3, alternative = "greater", mu = mu0)
ttest3$p.value
})
power[i] <- mean(pvalues <= .05)
}
plot(mu, power)
abline(v = mu0, lty = 1)
abline(h = .05, lty = 1)
