Module 5 Examples

require(latticeExtra)
require(mosaic)
require(Hmisc)
require(kable)
library(kableExtra)

Illustrative Example (pg 4):

Simulating Sampling Distribution of Mean for three different sample sizes, n = 5, n = 25 and n = 100 from exponential distribution with mean 5 and visualizing it using box plots.

set.seed(1234)
lambda<-2
numsim<-1000
mean5<-rep(0,numsim)
mean25<-mean5 # lines 27 and 28 create vectors of correct length to store the results
mean100<-mean5
mean1000<-mean5 # added a sample size of 1000 to equal array extent

for (i in 1:numsim){ #lines 30-33 sample from the distribution and store the results
  mean5[i]<-mean(rexp(5,lambda))
  mean25[i]<-mean(rexp(25,lambda))
  mean100[i]<-mean(rexp(100,lambda))
  mean1000[i]<-mean(rexp(1000,lambda))
  
}

boxplot(mean5,mean25,mean100,names=c("n=5","n=25","n=100")) 
title("Distribution of Mean of Exponentials")

dt <- rbind(mean(mean5), mean(mean25), mean(mean100), mean(mean1000))
dtnames <- c("Sample Size of 5", "Sample Size of 25", "Sample Size of 100", "Sample Size of 1000")
dttitle <- "Mean"
dimnames(dt) <- list(dtnames, dttitle)

dt %>%
  kbl(caption = "Monte Carlo Means") %>%
  kable_classic(full_width = F, html_font = "Cambria")

Monte Carlo Means
	Mean
Sample Size of 5	0.5118926
Sample Size of 25	0.4992561
Sample Size of 100	0.5016852
Sample Size of 1000	0.5004936

Solution:

Here, we looked at the sampling distribution of the mean of a set of independent and identically distributed exponential random variables. Look how the variance of the estimator is getting smaller with increase in sample size.The box plot is displayed and you can see that the sampling distribution appears to be symmetric when samples are of size 25 or greater, and the distribution is less variable when n = 100. By CLT, the mean should have normal distribution or at least approximate normal, for large sample sizes, regardless of the originating distribution from which samples came.

Example Comparing Mean, Trimmed Mean and Median (pg 5):

Compare three estimators of the mean (\(\mu\)) of a standard normal distribution. Comparing sample mean, trimmed mean and medians estimators of population mean. If the distribution of data is symmetric (as Standard Normal distribution of our example) all three estimators indeed estimate population mean but if the population from which sample is coming from is skewed then they do not.

Normal Distribution

set.seed(1234)

s<-1000 # declaring # of rows
n<-15
mu<-1
sigma <- sqrt(5/3)

generate.normal <- function(s, n, mu, sigma){ #Function to generate S data sets of size n from normal distribution with mean mu and variance sigma^2
dat <- matrix(rnorm(n*s, mu, sigma), ncol=n, byrow=T) #Note: for Normal data generation, we can get the data in one step like this, which requires no looping.
out <- list(dat=dat)
  return(out)
}

trimmean <- function(Y){mean(Y, 0.2)} #Function to compute the 20% trimmed mean
out <- generate.normal(s, n, mu, sigma) #generate normal samples
outsampmean <- apply(out$dat, 1, mean) #get the sample mean from the data set
outtrimmean <- apply(out$dat, 1, trimmean) #get the sample trimmed mean from the data set
outmedian <- apply(out$dat, 1, median) #get the sample median from the data set

summary.sim <- data.frame(mean=outsampmean,trim=outtrimmean,
                          median=outmedian)
simsum <- function(dat, trueval){ 
# Function to calculate summary statistics across the S data sets
# mu is the true value of the mean for the true distribution of the data

  S <- nrow(dat)
  
  MCmean <- apply(dat,2,mean) 
  MCbias <- MCmean-trueval 
  MCrelbias <- MCbias/trueval
  MCstddev <- sqrt(apply(dat,2,var)) 
  MCMSE <- apply((dat-trueval)^2,2,mean) 
  MCRE <- MCMSE[1]/MCMSE
  
  sumdat <- rbind(rep(trueval,3),S,MCmean,MCbias,MCrelbias,MCstddev,MCMSE,
                  MCRE)
  names <- c("True Value","Number of Simulations","MC Mean","MC Bias","MC Relative Bias",
             "MC Standard Deviation","MC MSE","MC Relative Efficiency")
  ests <- c("Sample mean","Trimmed Mean","Median")
  
  dimnames(sumdat) <- list(names,ests)
  round(sumdat,5)
}

dt <- simsum(summary.sim, mu)
round(dt, 3)

##                        Sample mean Trimmed Mean   Median
## True Value                   1.000        1.000    1.000
## Number of Simulations     1000.000     1000.000 1000.000
## MC Mean                      1.007        1.008    1.022
## MC Bias                      0.007        0.008    0.022
## MC Relative Bias             0.007        0.008    0.022
## MC Standard Deviation        0.330        0.350    0.398
## MC MSE                       0.109        0.123    0.159
## MC Relative Efficiency       1.000        0.887    0.685

dt2 <- round(dt, 3)
dt2 %>%
  kbl(caption = "Normal Distribution") %>%
  kable_classic(full_width = F, html_font = "Cambria")

Normal Distribution
	Sample mean	Trimmed Mean	Median
True Value	1.000	1.000	1.000
Number of Simulations	1000.000	1000.000	1000.000
MC Mean	1.007	1.008	1.022
MC Bias	0.007	0.008	0.022
MC Relative Bias	0.007	0.008	0.022
MC Standard Deviation	0.330	0.350	0.398
MC MSE	0.109	0.123	0.159
MC Relative Efficiency	1.000	0.887	0.685

Normal Dist. Solution:

Here, we can see that the sample mean is a better estimator than the trimmed mean and median. It is known we want the least biased estimator, and it is shown that the sample mean has the smallest bias in comparison to the trimmed and median. Along with having the smallest bias, it also has the smallest MSE, smallest standard deviation, and it is the most efficient (closest to or at 1). Thus the sample mean is closest to the true mean and is the most accurate normal distribution estimator.

Gamma Distribution:

set.seed(1234)
s <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)

generate.gamma <- function(S,n,mu,sigma){ #Function to generate S data sets of size n from gamma distribution with mean mu, variance sigma^2 mu^2
  
  a <- 1/(sigma^2) 
  s <- mu/a
  
  dat <- matrix(rgamma(n*S,shape=a,scale=s),ncol=n,byrow=T) 
  out <- list(dat=dat)
  return(out)
}

trimmean <- function(Y){mean(Y,0.2)}
out <- generate.gamma(s,n,mu,sigma)

outsampmean <- apply(out$dat, 1, mean) 
outtrimmean <- apply(out$dat, 1, trimmean)
outmedian <- apply(out$dat, 1, median) 

summary.sim <- data.frame(mean=outsampmean,trim=outtrimmean,
                          median=outmedian)
simsum <- function(dat, trueval){ 

  S <- nrow(dat)
  
  MCmean <- apply(dat,2,mean) 
  MCbias <- MCmean-trueval 
  MCrelbias <- MCbias/trueval
  MCstddev <- sqrt(apply(dat,2,var)) 
  MCMSE <- apply((dat-trueval)^2,2,mean) 
  MCRE <- MCMSE[1]/MCMSE
  
  sumdat <- rbind(rep(trueval,3),S,MCmean,MCbias,MCrelbias,MCstddev,MCMSE,
                  MCRE)
  names <- c("True Value","Number of Simulations","MC Mean","MC Bias","MC Relative Bias",
             "MC Standard Deviation","MC MSE","MC Relative Efficiency")
  ests <- c("Sample mean","Trimmed Mean","Median")
  
  dimnames(sumdat) <- list(names,ests)
  round(sumdat,5)
}

dt <- simsum(summary.sim, mu)
round(dt, 3)

##                        Sample mean Trimmed Mean   Median
## True Value                   1.000        1.000    1.000
## Number of Simulations     1000.000     1000.000 1000.000
## MC Mean                      1.007        0.691    0.587
## MC Bias                      0.007       -0.309   -0.413
## MC Relative Bias             0.007       -0.309   -0.413
## MC Standard Deviation        0.335        0.276    0.303
## MC MSE                       0.112        0.172    0.262
## MC Relative Efficiency       1.000        0.652    0.426

dt2 <- round(dt, 3)
dt2 %>%
  kbl(caption = "Gamma Distribution") %>%
  kable_classic(full_width = F, html_font = "Cambria")

Gamma Distribution
	Sample mean	Trimmed Mean	Median
True Value	1.000	1.000	1.000
Number of Simulations	1000.000	1000.000	1000.000
MC Mean	1.007	0.691	0.587
MC Bias	0.007	-0.309	-0.413
MC Relative Bias	0.007	-0.309	-0.413
MC Standard Deviation	0.335	0.276	0.303
MC MSE	0.112	0.172	0.262
MC Relative Efficiency	1.000	0.652	0.426

Gamma Dist. Solution:

Again, here the sample mean is the closest MC mean, smallest bias, most efficient, and smallest MSE. Thus the sample mean is closest to the true mean and is the most accurate gamma distribution estimator in comparison to the 20% trimmed mean and median.

t-Distribution:

set.seed(1234) 
S <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)

generate.t <- function(S,n,mu,df){
  
  dat <- matrix(mu + rt(n*S,df),ncol=n,byrow=T)
  out <- list(dat=dat)
  return(out)
}

trimmean <- function(Y){mean(Y,0.2)} 
out <- generate.t(S,n,mu,5)

outsampmean <- apply(out$dat,1,mean)
outtrimmean <- apply(out$dat,1,trimmean)
outmedian <- apply(out$dat,1,median)

summary.sim <- data.frame(mean=outsampmean,trim=outtrimmean, median=outmedian)

simsum2 <- function(dat,trueval){ S <- nrow(dat)

MCmean <- apply(dat,2,mean)
MCbias <- MCmean-trueval
MCrelbias <- MCbias/trueval 
MCstddev <- sqrt(apply(dat,2,var))
MCMSE <- apply((dat-trueval)^2,2,mean) 
MCRE <- MCMSE[1]/MCMSE

sumdat <- rbind(rep(trueval,3),S,MCmean,MCbias,MCrelbias,MCstddev,MCMSE, MCRE)

names <- c("True Value","Number of Simulations","MC Mean","MC Bias","MC Relative Bias",
             "MC Standard Deviation","MC MSE","MC Relative Efficiency")

ests <- c("Sample mean","Trimmed mean","Median")

dimnames(sumdat) <- list(names,ests)
round(sumdat,5) 
}

dt <- simsum(summary.sim, mu)
round(dt, 3)

##                        Sample mean Trimmed Mean   Median
## True Value                   1.000        1.000    1.000
## Number of Simulations     1000.000     1000.000 1000.000
## MC Mean                      0.989        0.998    1.006
## MC Bias                     -0.011       -0.002    0.006
## MC Relative Bias            -0.011       -0.002    0.006
## MC Standard Deviation        0.333        0.308    0.348
## MC MSE                       0.111        0.095    0.121
## MC Relative Efficiency       1.000        1.165    0.915

dt2 <- round(dt, 3)
dt2 %>%
  kbl(caption = "t-Distribution") %>%
  kable_classic(full_width = F, html_font = "Cambria")

t-Distribution
	Sample mean	Trimmed Mean	Median
True Value	1.000	1.000	1.000
Number of Simulations	1000.000	1000.000	1000.000
MC Mean	0.989	0.998	1.006
MC Bias	-0.011	-0.002	0.006
MC Relative Bias	-0.011	-0.002	0.006
MC Standard Deviation	0.333	0.308	0.348
MC MSE	0.111	0.095	0.121
MC Relative Efficiency	1.000	1.165	0.915

t-Dist. Solution:

Here, the trimmed mean is the closest MC mean, smallest bias, most efficient, and smallest MSE. Thus, the trimmed mean is closest to the true mean and is the most accurate t-distribution estimator in comparison to the sample mean and median.

Example 6.1 From textbook:

This is an illustration of basic Monte Carlo estimation. We generate a sample of size 2 from normal distribution and target parameter is the absolute value of difference in means.

    set.seed(1234)
    mu <- 1
    m <- 1000
    g <- numeric(m)
    MSE.Num <-numeric(m)
    
    for (i in 1:m) {
        x <- rnorm(2)
        g[i] <- abs(x[1] - x[2])
        MSE.Num[i]<-(g[i]-mu)^2 # MSE calculation of the numerator
       
    }
    
    est <- mean(g)
    bias.g<-est-0 # Monte Carlo Bias, note mu ==0

MC.SE<-sd(g) # MC Standard error
var.theor<-2-(4/pi) # theoretical variance
MC.MSE<-sum(MSE.Num)/m # Monte Carlo MSE

dt <- round(rbind(bias.g, est, var.theor, MC.SE, MC.MSE),3)

names <- c("Bias", "Mean Difference", "Theoretical Variance", "MC Standard Error", "MC MSE")

dimnames(dt) <- list(names)

dt %>%
  kbl(caption = "Monte Carlo Values") %>%
  kable_classic(full_width = F, html_font = "Cambria")

Monte Carlo Values
Bias	1.097
Mean Difference	1.097
Theoretical Variance	0.727
MC Standard Error	0.834
MC MSE	0.704