library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The goal of this problem is to understand confidence intervals using simulations. Recall that having a 95% confidence interval means that, under repeated sampling, 95% of the confidence intervals around the sample means will contain the population mean.
Write a function called ‘margin_error_95’ that takes as input a vector of sample values and the population standard deviation, and outputs the margin of error for a 95% confidence interval.
margin_error_95 <- function(sample_vals, sig){
  n <- length(sample_vals)
  # margin of error: z * sigma / sqrt(n), with z = 1.96 for a 95% CI
  mar_err <- 1.96 * (sig / sqrt(n))
  return(mar_err)
}
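As a quick sanity check (not part of the assignment, with arbitrary illustrative values): only the sample length and ‘sig’ enter the formula, so any vector of length 100 with sig = 1 gives 1.96/10.
margin_error_95(1:100, sig = 1) # 1.96 * 1 / sqrt(100)
## [1] 0.196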
Write a function ‘CI_95’ that takes as input a vector of sample values and the population standard deviation, and outputs the 95% confidence interval for this sample. You can use the ‘margin_error_95’ function.
CI_95 <- function(sample_vals, sig){
  error <- margin_error_95(sample_vals, sig)
  # CI: sample mean plus/minus the margin of error
  CI <- mean(sample_vals) + c(-1, 1) * error
  return(CI)
}
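For example, with a small made-up sample and sig = 1.2 (illustrative values only), the interval is the sample mean 5.025 plus or minus 1.96 * 1.2 / sqrt(4) = 1.176:
CI_95(c(4.8, 5.1, 5.3, 4.9), sig = 1.2)
## [1] 3.849 6.201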
Write a function ‘sample_size_95’ that takes as input ‘width’ and ‘std_dev’, the desired width of the 95% confidence interval and the population standard deviation, and outputs the sample size required to achieve that width for a 95% confidence interval.
sample_size_95 <- function(width, std_dev){
  # width = 2 * error, hence
  error <- width / 2
  # solve error = 1.96 * std_dev / sqrt(n) for n
  n <- (1.96 * std_dev / error)^2
  return(n) # round up with ceiling() when an integer sample size is needed
}
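As an illustration with made-up numbers: a desired width of 0.1 with a population standard deviation of 0.5 requires
sample_size_95(width = 0.1, std_dev = 0.5)
## [1] 384.16
ceiling(sample_size_95(width = 0.1, std_dev = 0.5)) # smallest integer sample size achieving the width
## [1] 385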
Write a function ‘width_95’ that takes as input ‘n’ and ‘std_dev’, the sample size and the population standard deviation, and outputs the width of the 95% confidence interval corresponding to this sample size.
width_95 <- function(n, std_dev){
  # margin of error for a 95% CI; the width is twice this
  error <- 1.96 * std_dev / sqrt(n)
  return(2 * error)
}
The helper function ‘mean_inside_CI’ checks whether the population mean falls inside the 95% confidence interval computed from a sample.
mean_inside_CI <- function(samp, pop_mu, pop_sig){
  upper <- mean(samp) + margin_error_95(samp, pop_sig)
  lower <- mean(samp) - margin_error_95(samp, pop_sig)
  # dplyr::between checks lower <= pop_mu <= upper
  return(between(pop_mu, lower, upper))
}
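Reusing the illustrative sample from above: the population mean 5 lies inside the interval (3.849, 6.201), so the check returns TRUE.
mean_inside_CI(c(4.8, 5.1, 5.3, 4.9), pop_mu = 5, pop_sig = 1.2)
## [1] TRUE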
We will simulate the calculation of a 95% confidence interval for a large number of samples of a fixed size from a population distribution whose mean we know.
We will then count the number of intervals that contain the population mean.
Use the replicate function to run a simulation of the following experiment:
set.seed(2020)
B <- 10000
n <- 50
CI_95_exp <- replicate(B,
  {samp <- rexp(n, 2) # generate a sample; for Exp(rate = 2), mean = sd = 1/2
   # check if the population mean is inside the 95% CI
   mean_inside_CI(samp, pop_mu = 1 / 2, pop_sig = 1 / 2)
  }
)
mean(CI_95_exp) # proportion of intervals that contain the population mean (empirical coverage)
## [1] 0.9498
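The empirical coverage 0.9498 is close to the nominal 0.95. As an extra check (not part of the assignment), the Monte Carlo standard error of a coverage estimate from B = 10000 replications is about sqrt(0.95 * 0.05 / B), so the observed value is well within one standard error of 0.95.
sqrt(0.95 * 0.05 / 10000)
## [1] 0.002179449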
Use the replicate function to run a simulation of the following experiment:
set.seed(2020)
B <- 10000
n <- 50
# gamma distribution parameters
alpha <- 2
beta <- 0.4
CI_95_gamma <- replicate(B,
  {samp <- rgamma(n, shape = alpha, rate = beta)
   # for Gamma(shape, rate): mean = shape / rate, sd = sqrt(shape) / rate
   mean_inside_CI(samp, pop_mu = alpha / beta, pop_sig = sqrt(alpha) / beta)
  }
)
mean(CI_95_gamma)
## [1] 0.9483
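The coverage 0.9483 is again close to the nominal 0.95. For reference, the theoretical mean and standard deviation plugged into ‘mean_inside_CI’ above are:
alpha / beta # mean of Gamma(shape = 2, rate = 0.4)
## [1] 5
sqrt(alpha) / beta # standard deviation
## [1] 3.535534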
In this section we will visualize confidence intervals. Recall that “calculating a 95% confidence interval” means: under repeated sampling, 95% of the confidence intervals around the sample means will contain the population mean.
The following function “plot_CI_95” plots 95% confidence intervals around 100 sample means, where each sample has size 30 and is drawn from a normal population distribution with mean 5 and standard deviation 1.2.
Get familiar with the function “plot_CI_95” by evaluating it for a few different seed values.
plot_CI_95 <- function(seed){
  B <- 100
  n <- 30
  mu <- 5
  sig <- 1.2
  # resetting the seed before each replicate() call reproduces the same
  # B samples, so x_1, x_0 and means all describe the same intervals
  set.seed(seed)
  # extract upper bounds of the CI's
  x_1 <- replicate(B,
    {samp <- rnorm(n, mean = mu, sd = sig)
     max(CI_95(samp, sig))
    }
  )
  set.seed(seed)
  # extract lower bounds of the CI's
  x_0 <- replicate(B,
    {samp <- rnorm(n, mean = mu, sd = sig)
     min(CI_95(samp, sig))
    }
  )
  set.seed(seed)
  means <- replicate(B, mean(rnorm(n, mean = mu, sd = sig)))
  plot(means, 1:B, pch = 20,
       xlim = c(mu - sig, mu + sig),
       ylim = c(0, B + 1),
       xlab = "sample means",
       ylab = "index of the CI",
       main = paste(B, "Confidence intervals")
  )
  for (i in 1:B){
    if(between(mu, x_0[i], x_1[i])){
      segments(x_0[i], i, x_1[i], i, lwd = 2) # CI's that contain the mean in black
    } else {
      segments(x_0[i], i, x_1[i], i, col = "red", lwd = 2) # CI's that miss the mean in red
    }
  }
  abline(v = mu, col = "blue") # vertical line at the population mean
}
Now use the “plot_CI_95” function to produce the plot for six different values of the random seed. What do you observe?
seeds <- c(2020, 42, 911, 14, 101, 34) # choose six different seed values
par(mfrow = c(1, 2))
for (i in seeds) {
  plot_CI_95(i)
}
In each plot, roughly 95 of the 100 intervals contain the population mean; which intervals miss it (drawn in red) varies from seed to seed, but the overall proportion stays close to 95%.
Suppose we are given a population distribution with mean \(\mu\) and variance \(\sigma^2\). Consider the following two estimators for the population variance \(\sigma^2\):
\[ \theta_1 = \frac{\sum_{i=1}^n(x_i - \overline{x})^2}{n-1}.\]
\[ \theta_2 = \frac{\sum_{i=1}^n(x_i - \overline{x})^2}{n}.\]
Write two functions “theta_1” and “theta_2” that take as input a sample of data values and output the values of the estimators \(\theta_1\) and \(\theta_2\) respectively.
theta_1 <- function(samp){
  # sample variance with the n - 1 (unbiased) denominator
  return(sum((samp - mean(samp))^2) / (length(samp) - 1))
}
theta_2 <- function(samp){
  # sample variance with the n (biased) denominator
  return(sum((samp - mean(samp))^2) / length(samp))
}
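As a quick check (beyond the assignment), R’s built-in var() implements \(\theta_1\); on a small illustrative vector, \(\theta_2\) equals \((n-1)/n\) times \(\theta_1\):
samp <- c(1, 2, 3, 4, 5)
var(samp) # var() uses the n - 1 denominator
## [1] 2.5
theta_1(samp)
## [1] 2.5
theta_2(samp) # (n - 1) / n * theta_1(samp) = 4 / 5 * 2.5
## [1] 2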
Suppose we are working with samples of size 10 coming from a normal distribution \(N(\mu = 6, \sigma = 1.6)\). Run a simulation to calculate the empirical expected values of the estimators \(\theta_1\) and \(\theta_2\).
B <- 100000
sig <- 1.6
theta_1_values <- replicate(B,
  {samp <- rnorm(10, mean = 6, sd = sig)
   theta_1(samp)
  }
)
theta_2_values <- replicate(B,
  {samp <- rnorm(10, mean = 6, sd = sig)
   theta_2(samp)
  }
)
Print the empirical expected values for the two estimators.
mean(theta_1_values)
## [1] 2.554226
mean(theta_2_values)
## [1] 2.308749
sig^2
## [1] 2.56
Use the empirical expected values of \(\theta_1\) and \(\theta_2\) to calculate the empirical bias of these estimators.
bias_theta_1 <- abs(mean(theta_1_values) - sig^2) # absolute value gives the magnitude of the bias
bias_theta_2 <- abs(mean(theta_2_values) - sig^2) # (theta_2 actually underestimates sigma^2)
bias_theta_1
## [1] 0.00577384
bias_theta_2
## [1] 0.2512512
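These values agree with theory (a check beyond the assignment): \(E[\theta_2] = \frac{n-1}{n}\sigma^2\), so for n = 10 the bias magnitude of \(\theta_2\) is \(\sigma^2/n\):
(10 - 1) / 10 * sig^2 # theoretical expected value of theta_2
## [1] 2.304
sig^2 / 10 # theoretical bias magnitude of theta_2
## [1] 0.256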
What conclusions can you draw about the unbiasedness of these estimators?
\(\theta_1\) is essentially unbiased (its empirical bias of about 0.006 is within simulation noise), while \(\theta_2\) is biased: its empirical bias of about 0.25 matches the theoretical value \(\sigma^2/n = 0.256\).
Now, suppose we are working with samples of size 500 coming from a normal distribution \(N(\mu = 6, \sigma = 1.6)\). Run a simulation to calculate the empirical expected values of the estimators \(\theta_1\) and \(\theta_2\).
theta_1_values <- replicate(B,
  {samp <- rnorm(500, mean = 6, sd = sig)
   theta_1(samp)
  }
)
theta_2_values <- replicate(B,
  {samp <- rnorm(500, mean = 6, sd = sig)
   theta_2(samp)
  }
)
mean(theta_1_values) # empirical expected value of theta_1
## [1] 2.560173
mean(theta_2_values) # empirical expected value of theta_2
## [1] 2.554193
sig^2 # theoretical value of the population variance
## [1] 2.56
Use the empirical expected values of \(\theta_1\) and \(\theta_2\) to calculate the empirical bias of these estimators.
bias_theta_1 <- abs(mean(theta_1_values) - sig^2)
bias_theta_2 <- abs(mean(theta_2_values) - sig^2)
bias_theta_1 # bias for theta_1
## [1] 0.0001725795
bias_theta_2 # bias for theta_2
## [1] 0.005807232
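This again matches the theoretical bias magnitude \(\sigma^2/n\) of \(\theta_2\), which shrinks with n:
sig^2 / 500 # theoretical bias magnitude of theta_2 at n = 500
## [1] 0.00512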
What conclusions can you draw about the unbiasedness of these estimators when sample size is large?
Large sample sizes reduce the bias of the \(\theta_2\) estimator to the point of being negligible, as expected since its theoretical bias \(\sigma^2/n\) shrinks as \(n\) grows. Still, we can see that \(\theta_1\) is less biased than \(\theta_2\).
Suppose we are working with samples of size 10 coming from a Gamma distribution with shape parameter \(\alpha = 2\) and rate parameter \(\beta = 0.5\). Run a simulation to calculate the empirical expected values of the estimators \(\theta_1\) and \(\theta_2\).
alpha <- 2
beta <- 0.5
theta_1_values <- replicate(B,
  {samp <- rgamma(10, shape = alpha, rate = beta)
   theta_1(samp)
  }
)
theta_2_values <- replicate(B,
  {samp <- rgamma(10, shape = alpha, rate = beta)
   theta_2(samp)
  }
)
Use the empirical expected values of \(\theta_1\) and \(\theta_2\) to calculate the empirical bias of these estimators.
bias_theta_1 <- abs(mean(theta_1_values) - alpha / beta^2) # population variance of Gamma(shape, rate) is alpha / beta^2
bias_theta_2 <- abs(mean(theta_2_values) - alpha / beta^2)
bias_theta_1 # bias for theta_1
## [1] 0.007121135
bias_theta_2 # bias for theta_2
## [1] 0.8013671
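The \(\theta_2\) result matches the theoretical bias magnitude \(\sigma^2/n\), here with population variance \(\sigma^2 = \alpha/\beta^2 = 8\) and n = 10:
(alpha / beta^2) / 10
## [1] 0.8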
What conclusions can you draw about the unbiasedness of these estimators?
\(\theta_2\) is much more biased for the Gamma distribution, since its bias \(\sigma^2/n\) scales with the population variance, which is larger here (\(\alpha/\beta^2 = 8\)); \(\theta_1\) is still essentially unbiased.
Now, suppose we are working with samples of size 500 coming from a Gamma distribution with shape parameter \(\alpha = 2\) and rate parameter \(\beta = 0.5\). Run a simulation to calculate the empirical expected values of the estimators \(\theta_1\) and \(\theta_2\).
alpha <- 2
beta <- 0.5
theta_1_values <- replicate(B,
  {samp <- rgamma(500, shape = alpha, rate = beta)
   theta_1(samp)
  }
)
theta_2_values <- replicate(B,
  {samp <- rgamma(500, shape = alpha, rate = beta)
   theta_2(samp)
  }
)
Use the empirical expected values of \(\theta_1\) and \(\theta_2\) to calculate the empirical bias of these estimators.
bias_theta_1 <- abs(mean(theta_1_values) - alpha / beta^2)
bias_theta_2 <- abs(mean(theta_2_values) - alpha / beta^2)
bias_theta_1 # bias for theta_1
## [1] 0.004755056
bias_theta_2 # bias for theta_2
## [1] 0.01556244
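Once more the \(\theta_2\) bias magnitude is close to the theoretical \(\sigma^2/n\):
(alpha / beta^2) / 500
## [1] 0.016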
What conclusions can you draw about the unbiasedness of these estimators when sample size is large?
The conclusions are similar to the normal-distribution case: a larger sample size reduces the empirical bias of both estimators, and the bias of \(\theta_1\) is much smaller than that of \(\theta_2\).