require(latticeExtra)
require(mosaic)
require(Hmisc)
require(kable)
library(kableExtra)
Simulating Sampling Distribution of Mean for three different sample sizes, n = 5, n = 25 and n = 100 from exponential distribution with mean 5 and visualizing it using box plots.
set.seed(1234)
lambda<-2
numsim<-1000
mean5<-rep(0,numsim)
mean25<-mean5 # lines 27 and 28 create vectors of correct length to store the results
mean100<-mean5
mean1000<-mean5 # added a sample size of 1000 to equal array extent
for (i in 1:numsim){ #lines 30-33 sample from the distribution and store the results
mean5[i]<-mean(rexp(5,lambda))
mean25[i]<-mean(rexp(25,lambda))
mean100[i]<-mean(rexp(100,lambda))
mean1000[i]<-mean(rexp(1000,lambda))
}
boxplot(mean5,mean25,mean100,names=c("n=5","n=25","n=100"))
title("Distribution of Mean of Exponentials")
dt <- rbind(mean(mean5), mean(mean25), mean(mean100), mean(mean1000))
dtnames <- c("Sample Size of 5", "Sample Size of 25", "Sample Size of 100", "Sample Size of 1000")
dttitle <- "Mean"
dimnames(dt) <- list(dtnames, dttitle)
dt %>%
kbl(caption = "Monte Carlo Means") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Mean | |
|---|---|
| Sample Size of 5 | 0.5118926 |
| Sample Size of 25 | 0.4992561 |
| Sample Size of 100 | 0.5016852 |
| Sample Size of 1000 | 0.5004936 |
Here, we looked at the sampling distribution of the mean of a set of independent and identically distributed exponential random variables. Look how the variance of the estimator is getting smaller with increase in sample size.The box plot is displayed and you can see that the sampling distribution appears to be symmetric when samples are of size 25 or greater, and the distribution is less variable when n = 100. By CLT, the mean should have normal distribution or at least approximate normal, for large sample sizes, regardless of the originating distribution from which samples came.
Compare three estimators of the mean (\(\mu\)) of a standard normal distribution. Comparing sample mean, trimmed mean and medians estimators of population mean. If the distribution of data is symmetric (as Standard Normal distribution of our example) all three estimators indeed estimate population mean but if the population from which sample is coming from is skewed then they do not.
set.seed(1234)
s<-1000 # declaring # of rows
n<-15
mu<-1
sigma <- sqrt(5/3)
generate.normal <- function(s, n, mu, sigma){ #Function to generate S data sets of size n from normal distribution with mean mu and variance sigma^2
dat <- matrix(rnorm(n*s, mu, sigma), ncol=n, byrow=T) #Note: for Normal data generation, we can get the data in one step like this, which requires no looping.
out <- list(dat=dat)
return(out)
}
trimmean <- function(Y){mean(Y, 0.2)} #Function to compute the 20% trimmed mean
out <- generate.normal(s, n, mu, sigma) #generate normal samples
outsampmean <- apply(out$dat, 1, mean) #get the sample mean from the data set
outtrimmean <- apply(out$dat, 1, trimmean) #get the sample trimmed mean from the data set
outmedian <- apply(out$dat, 1, median) #get the sample median from the data set
summary.sim <- data.frame(mean=outsampmean,trim=outtrimmean,
median=outmedian)
simsum <- function(dat, trueval){
# Function to calculate summary statistics across the S data sets
# mu is the true value of the mean for the true distribution of the data
S <- nrow(dat)
MCmean <- apply(dat,2,mean)
MCbias <- MCmean-trueval
MCrelbias <- MCbias/trueval
MCstddev <- sqrt(apply(dat,2,var))
MCMSE <- apply((dat-trueval)^2,2,mean)
MCRE <- MCMSE[1]/MCMSE
sumdat <- rbind(rep(trueval,3),S,MCmean,MCbias,MCrelbias,MCstddev,MCMSE,
MCRE)
names <- c("True Value","Number of Simulations","MC Mean","MC Bias","MC Relative Bias",
"MC Standard Deviation","MC MSE","MC Relative Efficiency")
ests <- c("Sample mean","Trimmed Mean","Median")
dimnames(sumdat) <- list(names,ests)
round(sumdat,5)
}
dt <- simsum(summary.sim, mu)
round(dt, 3)
## Sample mean Trimmed Mean Median
## True Value 1.000 1.000 1.000
## Number of Simulations 1000.000 1000.000 1000.000
## MC Mean 1.007 1.008 1.022
## MC Bias 0.007 0.008 0.022
## MC Relative Bias 0.007 0.008 0.022
## MC Standard Deviation 0.330 0.350 0.398
## MC MSE 0.109 0.123 0.159
## MC Relative Efficiency 1.000 0.887 0.685
dt2 <- round(dt, 3)
dt2 %>%
kbl(caption = "Normal Distribution") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Sample mean | Trimmed Mean | Median | |
|---|---|---|---|
| True Value | 1.000 | 1.000 | 1.000 |
| Number of Simulations | 1000.000 | 1000.000 | 1000.000 |
| MC Mean | 1.007 | 1.008 | 1.022 |
| MC Bias | 0.007 | 0.008 | 0.022 |
| MC Relative Bias | 0.007 | 0.008 | 0.022 |
| MC Standard Deviation | 0.330 | 0.350 | 0.398 |
| MC MSE | 0.109 | 0.123 | 0.159 |
| MC Relative Efficiency | 1.000 | 0.887 | 0.685 |
Here, we can see that the sample mean is a better estimator than the trimmed mean and median. It is known we want the least biased estimator, and it is shown that the sample mean has the smallest bias in comparison to the trimmed and median. Along with having the smallest bias, it also has the smallest MSE, smallest standard deviation, and it is the most efficient (closest to or at 1). Thus the sample mean is closest to the true mean and is the most accurate normal distribution estimator.
set.seed(1234)
s <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)
generate.gamma <- function(S,n,mu,sigma){ #Function to generate S data sets of size n from gamma distribution with mean mu, variance sigma^2 mu^2
a <- 1/(sigma^2)
s <- mu/a
dat <- matrix(rgamma(n*S,shape=a,scale=s),ncol=n,byrow=T)
out <- list(dat=dat)
return(out)
}
trimmean <- function(Y){mean(Y,0.2)}
out <- generate.gamma(s,n,mu,sigma)
outsampmean <- apply(out$dat, 1, mean)
outtrimmean <- apply(out$dat, 1, trimmean)
outmedian <- apply(out$dat, 1, median)
summary.sim <- data.frame(mean=outsampmean,trim=outtrimmean,
median=outmedian)
simsum <- function(dat, trueval){
S <- nrow(dat)
MCmean <- apply(dat,2,mean)
MCbias <- MCmean-trueval
MCrelbias <- MCbias/trueval
MCstddev <- sqrt(apply(dat,2,var))
MCMSE <- apply((dat-trueval)^2,2,mean)
MCRE <- MCMSE[1]/MCMSE
sumdat <- rbind(rep(trueval,3),S,MCmean,MCbias,MCrelbias,MCstddev,MCMSE,
MCRE)
names <- c("True Value","Number of Simulations","MC Mean","MC Bias","MC Relative Bias",
"MC Standard Deviation","MC MSE","MC Relative Efficiency")
ests <- c("Sample mean","Trimmed Mean","Median")
dimnames(sumdat) <- list(names,ests)
round(sumdat,5)
}
dt <- simsum(summary.sim, mu)
round(dt, 3)
## Sample mean Trimmed Mean Median
## True Value 1.000 1.000 1.000
## Number of Simulations 1000.000 1000.000 1000.000
## MC Mean 1.007 0.691 0.587
## MC Bias 0.007 -0.309 -0.413
## MC Relative Bias 0.007 -0.309 -0.413
## MC Standard Deviation 0.335 0.276 0.303
## MC MSE 0.112 0.172 0.262
## MC Relative Efficiency 1.000 0.652 0.426
dt2 <- round(dt, 3)
dt2 %>%
kbl(caption = "Gamma Distribution") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Sample mean | Trimmed Mean | Median | |
|---|---|---|---|
| True Value | 1.000 | 1.000 | 1.000 |
| Number of Simulations | 1000.000 | 1000.000 | 1000.000 |
| MC Mean | 1.007 | 0.691 | 0.587 |
| MC Bias | 0.007 | -0.309 | -0.413 |
| MC Relative Bias | 0.007 | -0.309 | -0.413 |
| MC Standard Deviation | 0.335 | 0.276 | 0.303 |
| MC MSE | 0.112 | 0.172 | 0.262 |
| MC Relative Efficiency | 1.000 | 0.652 | 0.426 |
Again, here the sample mean is the closest MC mean, smallest bias, most efficient, and smallest MSE. Thus the sample mean is closest to the true mean and is the most accurate gamma distribution estimator in comparison to the 20% trimmed mean and median.
set.seed(1234)
S <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)
generate.t <- function(S,n,mu,df){
dat <- matrix(mu + rt(n*S,df),ncol=n,byrow=T)
out <- list(dat=dat)
return(out)
}
trimmean <- function(Y){mean(Y,0.2)}
out <- generate.t(S,n,mu,5)
outsampmean <- apply(out$dat,1,mean)
outtrimmean <- apply(out$dat,1,trimmean)
outmedian <- apply(out$dat,1,median)
summary.sim <- data.frame(mean=outsampmean,trim=outtrimmean, median=outmedian)
simsum2 <- function(dat,trueval){ S <- nrow(dat)
MCmean <- apply(dat,2,mean)
MCbias <- MCmean-trueval
MCrelbias <- MCbias/trueval
MCstddev <- sqrt(apply(dat,2,var))
MCMSE <- apply((dat-trueval)^2,2,mean)
MCRE <- MCMSE[1]/MCMSE
sumdat <- rbind(rep(trueval,3),S,MCmean,MCbias,MCrelbias,MCstddev,MCMSE, MCRE)
names <- c("True Value","Number of Simulations","MC Mean","MC Bias","MC Relative Bias",
"MC Standard Deviation","MC MSE","MC Relative Efficiency")
ests <- c("Sample mean","Trimmed mean","Median")
dimnames(sumdat) <- list(names,ests)
round(sumdat,5)
}
dt <- simsum(summary.sim, mu)
round(dt, 3)
## Sample mean Trimmed Mean Median
## True Value 1.000 1.000 1.000
## Number of Simulations 1000.000 1000.000 1000.000
## MC Mean 0.989 0.998 1.006
## MC Bias -0.011 -0.002 0.006
## MC Relative Bias -0.011 -0.002 0.006
## MC Standard Deviation 0.333 0.308 0.348
## MC MSE 0.111 0.095 0.121
## MC Relative Efficiency 1.000 1.165 0.915
dt2 <- round(dt, 3)
dt2 %>%
kbl(caption = "t-Distribution") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Sample mean | Trimmed Mean | Median | |
|---|---|---|---|
| True Value | 1.000 | 1.000 | 1.000 |
| Number of Simulations | 1000.000 | 1000.000 | 1000.000 |
| MC Mean | 0.989 | 0.998 | 1.006 |
| MC Bias | -0.011 | -0.002 | 0.006 |
| MC Relative Bias | -0.011 | -0.002 | 0.006 |
| MC Standard Deviation | 0.333 | 0.308 | 0.348 |
| MC MSE | 0.111 | 0.095 | 0.121 |
| MC Relative Efficiency | 1.000 | 1.165 | 0.915 |
Here, the trimmed mean is the closest MC mean, smallest bias, most efficient, and smallest MSE. Thus, the trimmed mean is closest to the true mean and is the most accurate t-distribution estimator in comparison to the sample mean and median.
This is an illustration of basic Monte Carlo estimation. We generate a sample of size 2 from normal distribution and target parameter is the absolute value of difference in means.
set.seed(1234)
mu <- 1
m <- 1000
g <- numeric(m)
MSE.Num <-numeric(m)
for (i in 1:m) {
x <- rnorm(2)
g[i] <- abs(x[1] - x[2])
MSE.Num[i]<-(g[i]-mu)^2 # MSE calculation of the numerator
}
est <- mean(g)
bias.g<-est-0 # Monte Carlo Bias, note mu ==0
MC.SE<-sd(g) # MC Standard error
var.theor<-2-(4/pi) # theoretical variance
MC.MSE<-sum(MSE.Num)/m # Monte Carlo MSE
dt <- round(rbind(bias.g, est, var.theor, MC.SE, MC.MSE),3)
names <- c("Bias", "Mean Difference", "Theoretical Variance", "MC Standard Error", "MC MSE")
dimnames(dt) <- list(names)
dt %>%
kbl(caption = "Monte Carlo Values") %>%
kable_classic(full_width = F, html_font = "Cambria")
| Bias | 1.097 |
| Mean Difference | 1.097 |
| Theoretical Variance | 0.727 |
| MC Standard Error | 0.834 |
| MC MSE | 0.704 |