chap 6 practice

require(latticeExtra)
require(mosaic)
require(Hmisc)
library(kableExtra)
library(knitr)

Example 1:

Simulating Sampling Distribution of Mean for three different sample sizes, n=5, n=25 and n=100 from exponential distribution with mean 5 and visualizing it using boxplots.

set.seed(1234)

lambda<-2
numsim<-100
mean5<-rep(0,numsim)
mean25<-mean5
mean100<-mean5

for (i in 1:numsim){
  mean5[i]<-mean(rexp(5,lambda))
  mean25[i]<-mean(rexp(25,lambda))
  mean100[i]<-mean(rexp(100,lambda))
  
}
boxplot(mean5,mean25,mean100,names=c("n=5","n=25","n=100"))
title("Distribution of Mean of Exponentials")

par(mfcol=c(1,3))
hist(mean5)
lines(density(mean5))
hist(mean25)
lines(density(mean25))
hist(mean100)
lines(density(mean100))

par(mfcol=c(1,1))

a <- c(mean(mean5),mean(mean25),mean(mean100), var(mean5), var(mean25), var(mean100))
tab <- matrix(rep(a, times=1), ncol=3, byrow=TRUE)
colnames(tab) <- c('Mean5', 'Mean25', 'Mean100')
rownames(tab) <- c('Mean', 'Variance')
tab <- as.table(round(tab, digits = 3))
tab %>%
  kbl(caption = "Mean of Variance for 3 Sample Sizes") %>%
  kable_paper("hover", full_width = F)

Mean of Variance for 3 Sample Sizes
	Mean5	Mean25	Mean100
Mean	0.492	0.493	0.496
Variance	0.043	0.008	0.002

Notice how the variance of the estimator is getting smaller with increase in sample size. By CLT, the mean should have normal distribution or at least approximate normal, for large sample sizes, regardless of the orignating distribution from which samples came.

Example 2

Compare three estimators of the mean (μ) of a standard normal distribution. Comparing sample mean, trimmed mean and medians estimators of population mean. If the distribution of data is symmetric (as Standard Normal distribution of our example) all three estimators indeed estimate population mean but if the population from which sample is coming from is skewed then they do not.

set.seed(1234567)

s <- 3 #number of rows
n <- 5 #number of columns

mymatrix <- matrix(rnorm(s*n), nrow = 3, ncol = 5)

#need to add three columns for sample mean, trimmed mean, median

Sample.Mean <- numeric(s) 
Trimmed.Mean <- numeric(s) 
Median <- numeric(s)
#Notice there are three estimators (mean, trimmed mean and median) whose properties we are comparing

mydata <- cbind(mymatrix, Sample.Mean, Trimmed.Mean, Median)
mydata[,n+1] <- apply(mydata[,1:n],1,mean) 
#finding mean for each row/sample

trimmean <- function(Y)
{
mean(Y,0.2)
}

mydata[,n+2] <- apply(mydata[,1:n],1,trimmean) 
#finding trim mean for each sample/row

mydata[,n+3] <- apply(mydata[,1:n],1,median) 
#finding median for each sample/row

mydata_round <- round(mydata, digits = 4)

kbl(mydata_round, caption = "Comparing Three Estimators Mu") %>%
  kable_classic() %>%
  add_header_above(c("N(0,1)" = 5, " " = 3))

Comparing Three Estimators Mu
N(0,1)
					Sample.Mean	Trimmed.Mean	Median
0.1567	-1.3508	-1.7781	-0.1577	-0.3782	-0.7016	-0.6289	-0.3782
1.3738	-0.0085	0.9095	1.1020	-0.4986	0.5756	0.6677	0.9095
0.7307	0.3210	-0.9194	1.5416	-0.5413	0.2265	0.1701	0.3210

Example 2 Using functions

Simulation to compare sampling properties of three different estimators for the mean of a distribution based on an iid sample of size n * sample mean * 20% trimmed mean * median. Note we can generate samples from gamma, normal, and t to see the difference in estimators.

###Normal Distribution

set.seed(1234)

s <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)

generate.normal <- function(s, n, mu, sigma){
dat <- matrix(rnorm(n*s, mu, sigma), ncol=n, byrow=T)
out <- list(dat=dat)
  return(out)
}
#function to generate S data sets of size n from normal distribution with mean mu and variance sigma^2

trimmean <- function(Y){mean(Y, 0.2)}
#function to compute the 20% trimmed mean

out <- generate.normal(s, n, mu, sigma)
outsamplemean <- apply(out$dat, 1, mean)
outtrimmean <- apply(out$dat, 1, trimmean)
outmedian <- apply(out$dat, 1, median)

summary.sim <- data.frame(mean = outsamplemean, trim = outtrimmean, median = outmedian)

#function to calculate summary statistics across the S data sets 
simsum <- function(dat,trueval){
  
  S <- nrow(dat)
  
  MCmean <- apply(dat,2,mean)
  MCbias <- MCmean-trueval
  MCrelbias <- MCbias/trueval
  MCstddev <- sqrt(apply(dat,2,var))
  MCMSE <- apply((dat-trueval)^2,2,mean)  
  MCRE <- MCMSE[1]/MCMSE
  
  sumdat <- rbind(rep(trueval, 3), S, MCmean, MCbias,               MCrelbias, MCstddev, MCMSE, MCRE)
  
  names <- c("true value", "# sims","MC mean", "MC bias", "MC relative bias", "MC standard deviation", "MC MSE", "MC relative efficiency")
  
  ests <- c("Sample mean", "Trimmed mean", "Median")
  
  dimnames(sumdat) <- list(names, ests)
  round(sumdat, 5)
}

ntab <- round(simsum(summary.sim, mu), 3)
ntab %>%
  kbl(caption = "Normal Distribution") %>%
  kable_paper("hover", full_width = F)

Normal Distribution
	Sample mean	Trimmed mean	Median
true value	1.000	1.000	1.000
# sims	1000.000	1000.000	1000.000
MC mean	1.007	1.008	1.022
MC bias	0.007	0.008	0.022
MC relative bias	0.007	0.008	0.022
MC standard deviation	0.330	0.350	0.398
MC MSE	0.109	0.123	0.159
MC relative efficiency	1.000	0.887	0.685

Gamma Distribution:

set.seed(1234)

s <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)

generate.gamma <- function(s, n, mu, sigma){
  a <- 1/(sigma^2)
  b <- mu/a
  dat1 <- matrix(rgamma(n*s, shape = a, scale = b),               ncol=n, byrow=T)
  out1 <- list(dat1=dat1)
  return(out1)
}

trimmean1 <- function(Y){mean(Y, 0.2)}

out1 <- generate.gamma(s, n, mu, sigma)
outsamplemean1 <- apply(out1$dat1, 1, mean)
outtrimmean1 <- apply(out1$dat1, 1, trimmean)
outmedian1 <- apply(out1$dat1, 1, median)

summary.sim1 <- data.frame(mean = outsamplemean1, trim = outtrimmean1, median = outmedian1)

#function to calculate summary statistics across the S data sets 
simsum1 <- function(dat1,trueval){
  
  S1 <- nrow(dat1)
  
  MCmean1 <- apply(dat1,2,mean)
  MCbias1 <- MCmean1-trueval
  MCrelbias1 <- MCbias1/trueval
  MCstddev1 <- sqrt(apply(dat1,2,var))
  MCMSE1 <- apply((dat1-trueval)^2,2,mean)  
  MCRE1 <- MCMSE1[1]/MCMSE1
  
  sumdat1 <- rbind(rep(trueval, 3), S1, MCmean1, MCbias1, MCrelbias1, MCstddev1, MCMSE1, MCRE1)
  
  names1 <- c("true value", "# sims","MC mean", "MC bias", "MC relative bias", "MC standard deviation", "MC MSE", "MC relative efficiency")
  
  ests1 <- c("Sample mean", "Trimmed mean", "Median")
  
  dimnames(sumdat1) <- list(names1, ests1)
  round(sumdat1, 5)
}

gtab <- round(simsum1(summary.sim1, mu), 3)
gtab %>%
  kbl(caption = "Gamma Distribution") %>%
  kable_paper("hover", full_width = F)

Gamma Distribution
	Sample mean	Trimmed mean	Median
true value	1.000	1.000	1.000
# sims	1000.000	1000.000	1000.000
MC mean	1.007	0.691	0.587
MC bias	0.007	-0.309	-0.413
MC relative bias	0.007	-0.309	-0.413
MC standard deviation	0.335	0.276	0.303
MC MSE	0.112	0.172	0.262
MC relative efficiency	1.000	0.652	0.426

t - Distribution

set.seed(1234)

s <- 1000
n <- 15
mu <- 1
sigma <- sqrt(5/3)

generate.t <- function(s, n, mu, df){
dat2 <- matrix(mu +rt(n*s, df), ncol=n, byrow=T)
out2 <- list(dat2=dat2)
  return(out2)
}

trimmean2 <- function(Y){mean(Y, 0.2)}
#function to compute the 20% trimmed mean

out2 <- generate.t(s, n, mu, 5)
outsamplemean2 <- apply(out2$dat2, 1, mean)
outtrimmean2 <- apply(out2$dat2, 1, trimmean)
outmedian2 <- apply(out2$dat2, 1, median)

summary.sim2 <- data.frame(mean = outsamplemean2, trim = outtrimmean2, median = outmedian2)

#function to calculate summary statistics across the S data sets 
simsum2 <- function(dat2,trueval){
  
  S2 <- nrow(dat2)
  
  MCmean2 <- apply(dat2,2,mean)
  MCbias2 <- MCmean2-trueval
  MCrelbias2 <- MCbias2/trueval
  MCstddev2 <- sqrt(apply(dat2,2,var))
  MCMSE2 <- apply((dat2-trueval)^2,2,mean)  
  MCRE2 <- MCMSE2[1]/MCMSE2
  
  sumdat2 <- rbind(rep(trueval, 3), S2, MCmean2, MCbias2, MCrelbias2, MCstddev2, MCMSE2, MCRE2)
  
  names2 <- c("true value", "# sims","MC mean", "MC bias", "MC relative bias", "MC standard deviation", "MC MSE", "MC relative efficiency")
  
  ests2 <- c("Sample mean", "Trimmed mean", "Median")
  
  dimnames(sumdat2) <- list(names2, ests2)
  round(sumdat2, 5)
}

ttab <- round(simsum2(summary.sim2, mu), 3)
ttab %>%
  kbl(caption = "t - Distribution") %>%
  kable_paper("hover", full_width = F)

t - Distribution
	Sample mean	Trimmed mean	Median
true value	1.000	1.000	1.000
# sims	1000.000	1000.000	1000.000
MC mean	0.989	0.998	1.006
MC bias	-0.011	-0.002	0.006
MC relative bias	-0.011	-0.002	0.006
MC standard deviation	0.333	0.308	0.348
MC MSE	0.111	0.095	0.121
MC relative efficiency	1.000	1.165	0.915

Example 6.1 From textbook

This is an illustration of basic Monte Carlo estimation. We generate a sample of size 2 from normal distribution and target parameter is the absolute value of difference in means.

s <- 1000
g <- numeric(s)
for (i in 1:s) {
  x <- rnorm(2,0,1)
  g[i] <- abs(x[1] - x[2])
}
MCest <- mean(g)
MCsd <- sd(g)
theorvar <- 2 - (4/pi)
MCmse <- sum(g)/s
bias.g <- MCest-0 # Monte Carlo Bias, note mu ==0

tab1 <- matrix(c(MCest, MCsd, MCmse, theorvar, bias.g), ncol = 5, nrow = 1)
colnames(tab1) <- c("MC Estimate", "MC SD", "MC MSE", "Theor Var", "Bias")
rownames(tab1) <- ""
tab1 <- as.table(round(tab1, digits = 3))
tab1 %>%
  kbl(caption = "Monte Carlo Estimation") %>%
  kable_paper("hover", full_width = F)

Monte Carlo Estimation
	MC Estimate	MC SD	MC MSE	Theor Var	Bias
	1.165	0.861	1.165	0.727	1.165

Example 6.4 Confidence interval for variance:

n <- 20
alpha <- .05
x <- rnorm(n, mean=0, sd=2)
UCL <- (n-1) * var(x) / qchisq(alpha, df=n-1)

tucl <- matrix(c(UCL), ncol = 1, nrow = 1)
colnames(tucl) <- c("Upper Confidence Limit")
rownames(tucl) <- ""
tucl <- round(tucl, digits = 3)
tcul <- as.table(tucl)

tucl %>%
  kbl(caption = "Upper Confidence Limit") %>%
  kable_paper("hover", full_width = F)

Upper Confidence Limit
	Upper Confidence Limit
	3.658

Example 6.5 MC estimate of confidence level for variance.

n <- 20
alpha <- .05
UCL1 <- replicate(1000, expr = {
    x <- rnorm(n, mean = 0, sd = 2)
    (n-1) * var(x) / qchisq(alpha, df = n-1)
    } )
#count the number of intervals that contain sigma^2=4
sucl <- sum(UCL1 > 4)/1000

#or we can compute the mean to get the confidence level
mucl <- mean(UCL1 > 4)

tabl <- matrix(c(mucl), ncol = 1, nrow = 1)
colnames(tabl) <- c("MC Estimate")
rownames(tabl) <- ""
tabl <- as.table(tabl)

tabl %>%
  kbl(caption = "MC Estimate of Confidence Level for Variance") %>%
  kable_paper("hover", full_width = F)

MC Estimate of Confidence Level for Variance
	MC Estimate
	0.949

set.seed(1234)

n1 <- 10
alpha <- .05
LCL2 <- replicate(10000, expr = {
    x <- rnorm(n1, mean = 2, sd = 1)
    mean(x) - qt((1 - alpha)/2, df = (n1 - 1)) * (sqrt((var(x))/n1))
    } )
UCL2 <- replicate(10000, expr = {
    x <- rnorm(n1, mean = 2, sd = 1)
    mean(x) + qt((1 - alpha)/2, df = (n1 - 1)) * (sqrt((var(x))/n1))
    } )

#or we can compute the mean to get the confidence level
muc2 <- mean(UCL2 > 2)
mlc2 <- mean(LCL2 < 2)
mci <- muc2 + mlc2

tab2 <- matrix(c(mci), ncol = 1, nrow = 1)
colnames(tab2) <- c("MC Estimate")
rownames(tab2) <- ""
tab2 <- as.table(tab2)

tab2 %>%
  kbl(caption = "MC Estimate of Confidence Level for Variance") %>%
  kable_paper("hover", full_width = F)

MC Estimate of Confidence Level for Variance
	MC Estimate
	0.951

Example 6.7:

n <- 20
alpha <- .05
mu0 <- 500
sigma <- 100

m <- 10000
p <- numeric(m)
for (j in 1:m) {
  x <-rnorm(n, mu0, sigma)
  ttest <- t.test(x, alternative = "greater", mu = mu0)
  p[j] <- ttest$p.value
}

p.hat <- mean(p < alpha)
se.hat <- sqrt(p.hat * (1 - p.hat)/m)

tab3 <- matrix(c(round(p.hat, 3), round(se.hat, 3)), ncol = 2, nrow = 1)
colnames(tab3) <- c("Alpha Hat", "SE")
rownames(tab3) <- ""
tab3 <- as.table(tab3)

tab3 %>%
  kbl(caption = "Empirical Type I Error") %>%
  kable_paper("hover", full_width = F)

Empirical Type I Error
	Alpha Hat	SE
	0.047	0.002

set.seed(123456)
n <- 20
alpha <- .05
mu0 <- 500
sigma <- 100

m <- 1000
p2 <- numeric(m)
for (k in 1:m){
  x2 <- rchisq(n, df = mu0)
  ttest2 <- t.test(x2, alternative = "greater", mu = mu0)
  p2[k] <- ttest2$p.value
}

p.hat2 <- mean(p2 < alpha)
se.hat2 <- sqrt(p.hat2 * (1-p.hat2) / m)

ctabl <- matrix(c(round(p.hat2, 3), round(se.hat2, 3)), ncol = 2, nrow = 1)
colnames(ctabl) <- c("Alpha Hat MC", "SE Hat")
rownames(ctabl) <- ""
ctabl <- as.table(ctabl)

ctabl %>%
  kbl(caption = "Empirical Type I Error Chi-Square") %>%
  kable_paper("hover", full_width = F)

Empirical Type I Error Chi-Square
	Alpha Hat MC	SE Hat
	0.037	0.006

Example 6.9:

n <- 20
h <- 1000
mu0 <- 500
sigma <- 100
mu <- c(seq(450, 650, 10))
M <- length(mu)
power <- numeric(M)
for (i in 1:M) {
  mu1 <- mu[i]
  pvalues <- replicate(h, expr = {
    x3 <- rnorm(n, mean = mu1, sd = sigma)
    ttest3 <- t.test(x3, alternative = "greater", mu = mu0)
    ttest3$p.value
  })
  power[i] <- mean(pvalues <= .05)
}

plot(mu, power)
abline(v = mu0, lty = 1)
abline(h = .05, lty = 1)