An implementation of the “Simulating Variance Estimates” by ProfessorParris on youTube

Population variance is calculated:

\(\sigma = \frac{SS}{n}\)

But for sample standard deviations this formula systematically underestimates the variance and we use the sample standard deviation:

\(s = \frac{SS}{n - 1}\)

The following analysis will simulate repeated sampling from a normal distribution with a known mean (\(\mu\)) and standard deviation (\(\sigma\)) at different sample sizes.

We will then calculate the estimated variance and standard deviation using the population variance formula and the sample variance formula, and compare the results.

library(tidyr)
library(purrr)
library(dplyr)
library(knitr)
sample_data <- function(reps = 10000, mu = 100, sigma = 15, nsamples){
  df <- data.frame("rep" = seq(1:reps), "mu" = mu, "sigma"= sigma, "n" = nsamples) 
  df %>% 
    group_by(rep) %>% 
    do(data = data.frame(value= rnorm(nsamples, mu, sigma))) %>%
    unnest(data) %>% 
    mutate(nsamples = nsamples)
}
df <- bind_rows(sample_data(nsamples = 2),
                sample_data(nsamples = 5),
                sample_data(nsamples = 60),
                sample_data(nsamples = 100))

Then we calcuates the sample mean (\(\bar{X}\)), Sum of squares (SS), variance and standard deviation using the population estimates (n in denominator) and using the sample estimates (n-1 in denominator).

stats <- df %>%
  group_by(nsamples, rep) %>% 
  summarise(nreps = n(),
              xbar  = mean(value),
              SS    = sum((value - xbar)^2),
              popvariance = SS / nreps,
              samvariance = SS / (nreps - 1),
              sigma_est = sqrt(popvariance),
              s_est     = sqrt(samvariance))
stats

Reformat the variances to show the difference between the average variance calculated using the population and the sample formula.

vars <- stats %>%
  select(nsamples, rep, popvariance, samvariance) %>%
  rename(population = popvariance,
         sample     = samvariance) %>%
  gather("vartype", "variance", 3:4) 
meanvars <- vars %>% 
  group_by(nsamples, vartype) %>% 
  summarise(mean = mean(variance))
meanvars %>% 
  spread(vartype, mean) %>% 
  kable() 
nsamples population sample
2 112 223
5 180 225
60 222 225
100 223 225

Despite the small sample size the sample variance is on average a good estimate of the actual variance (\(\sigma^2 = 15^2 = 225\)).

When we plot the distribution it is clear that the individual estimates of the standard deviation have a large distribution which is highly skewed (especially as smalle sample sizes). The sample variance corrects well (on average) for the underestimation.

plot_variance_histogram <- function(n) {
  means <- meanvars %>% 
    filter(nsamples == n)
  
  vars %>%
    filter(nsamples == n) %>% 
    ggplot(aes(variance)) + 
      geom_histogram(binwidth = 25) +
      facet_grid(vartype~.) +
      scale_x_continuous(breaks = seq(-200, 1400, 200) ) +
      geom_vline(data = means %>% filter(nsamples == n), 
                 aes(xintercept = mean), color= "red", size = 1.1) +
      geom_vline(xintercept = 225, color= "blue") +
      labs(title = paste0("simulation of estimating variance (nsamples = ", n, ")"),
           subtitle= "comparing sample (n-1) and population (n) variance")
}
plot_variance_histogram(n= 5)

plot_variance_histogram(n= 60)

plot_variance_histogram(n= 2)

The sample standard deviation is however still systematially underestimated compared to the actual \(\sigma\) = 15.

stddevs <- stats %>%
  select(nsamples, rep, sigma_est, s_est) %>%
  rename(population = sigma_est,
         sample     = s_est) %>%
  gather("vartype", "stddev", 3:4) 
meansds <- stddevs %>% 
  group_by(nsamples, vartype) %>% 
  summarise(mean = mean(stddev))
meansds %>% 
  kable()
nsamples vartype mean
2 population 8.47
2 sample 11.98
5 population 12.60
5 sample 14.09
60 population 14.83
60 sample 14.95
100 population 14.89
100 sample 14.96
n = 100
means <- means %>%
  filter(nsamples == n)
stddevs %>%
    filter(nsamples == n) %>% 
    ggplot(aes(stddev)) + 
      geom_histogram(binwidth = 0.5) +
      facet_grid(vartype~.) +
      scale_x_continuous(breaks = seq(10, 20, 1) ) +
      geom_vline(data = means %>% filter(nsamples == n), 
                 aes(xintercept = mean), color= "red", size = 1.1) +
      geom_vline(xintercept = 15, color= "blue") +
      labs(title = paste0("simulation of estimating std. dev (nsamples = ", n, ")"),
           subtitle= "comparing sample (n-1) and population (n) sd")

n = 100
stddevs %>%
  filter(nsamples == n) %>% 
  ggplot(aes(stddev)) + 
    geom_histogram(binwidth = 0.05) +
    facet_grid(vartype~.) +
    scale_x_continuous(breaks = seq(10, 20, 1) ) +
    coord_cartesian(xlim = c(14, 16))+
    geom_vline(data = means %>% filter(nsamples == n), 
               aes(xintercept = mean), color= "red", size = 1.1) +
    geom_vline(xintercept = 15, color= "blue") +
    labs(title = paste0("simulation of estimating std. dev (nsamples = ", n, ")"),
         subtitle= "comparing sample (n-1) and population (n) sd")

This demonstrates that while the estimate of the sample mean and sample variance is an unbiased estimator of the population mean (\(\mu\)) population variance (\(\sigma\)). However, sample standard deviation is not an unbiased estimator of the population standard deviation and systematically underestimates the standard deviation.

How well does the standard error estimate the error in the population mean?

As I had all this simulation data I thought that I would have a look at the difference between the errors in the xbar confidence interval (based on the SE mean), using the z distribution and the t-distribution at different sample sizes.

stats <- stats %>%
  mutate(SEmean = s_est / sqrt(nreps),
         ZciLL   = (xbar - 1.96 * SEmean),
         ZciUL   = (xbar + 1.96 * SEmean),
         ZmeanOK = ifelse( (100 > ZciLL) & (100 < ZciUL), 
                          yes = TRUE, no = FALSE ),
         tciLL   = (xbar - qt(p = 0.975, df= (nreps - 1)) * SEmean),
         tciUL   = (xbar + qt(p = 0.975, df= (nreps - 1)) * SEmean),
         tmeanOK = ifelse( (100 > tciLL) & (100 < tciUL), 
                          yes = TRUE, no = FALSE )  )
stats %>%
  group_by(nsamples) %>% 
  summarise(zmean = sum(ZmeanOK) / 10000,
            avgZciRange = mean(ZciUL - ZciLL), 
            tmean = sum(tmeanOK) / 10000,
            avgtciRange = mean(tciUL - tciLL) ) %>% 
  kable()
nsamples zmean avgZciRange tmean avgtciRange
2 0.702 33.20 0.948 215.21
5 0.872 24.69 0.949 34.98
60 0.944 7.57 0.949 7.72
100 0.948 5.87 0.951 5.94
stats %>%   
  ggplot(aes(x= xbar)) + 
    geom_histogram(bins = 100) +
    geom_vline(xintercept = 100, color= "red") +
    facet_grid(nsamples~., scales = "free") +
    labs(title= "Distribution of sample means",
         subtitle= "with different sample sizes")

---
title: "Simulating Variance Estimates"
author: "Aaron M. Saunders"
date: "December 21, 2016"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


An implementation of the "Simulating Variance Estimates" by ProfessorParris on [youTube](https://youtu.be/Jgdt6qKB5Cw)


Population variance is calculated:

$\sigma = \frac{SS}{n}$
 
But for sample standard deviations this formula systematically underestimates the variance and we use the sample standard deviation:

$s = \frac{SS}{n - 1}$

The following analysis will simulate repeated sampling from a normal distribution with a known mean ($\mu$) and standard deviation ($\sigma$) at different sample sizes. 

We will then calculate the estimated variance and standard deviation using the population variance formula and the sample variance formula, and compare the results.


```{r message=FALSE, warning=FALSE, results='hide'}
library(tidyr)
library(purrr)
library(dplyr)
library(knitr)

sample_data <- function(reps = 10000, mu = 100, sigma = 15, nsamples){
  df <- data.frame("rep" = seq(1:reps), "mu" = mu, "sigma"= sigma, "n" = nsamples) 
  df %>% 
    group_by(rep) %>% 
    do(data = data.frame(value= rnorm(nsamples, mu, sigma))) %>%
    unnest(data) %>% 
    mutate(nsamples = nsamples)
}

df <- bind_rows(sample_data(nsamples = 2),
                sample_data(nsamples = 5),
                sample_data(nsamples = 60),
                sample_data(nsamples = 100))
```


Then we calcuates the sample mean ($\bar{X}$), Sum of squares (SS), variance and standard deviation using the population estimates (n in denominator) and using the sample estimates (n-1 in denominator).

```{r}
stats <- df %>%
  group_by(nsamples, rep) %>% 
  summarise(nreps = n(),
              xbar  = mean(value),
              SS    = sum((value - xbar)^2),
              popvariance = SS / nreps,
              samvariance = SS / (nreps - 1),
              sigma_est = sqrt(popvariance),
              s_est     = sqrt(samvariance))
stats

```

Reformat the variances to show the difference between the average variance calculated using the population and the sample formula.

```{r}
vars <- stats %>%
  select(nsamples, rep, popvariance, samvariance) %>%
  rename(population = popvariance,
         sample     = samvariance) %>%
  gather("vartype", "variance", 3:4) 

meanvars <- vars %>% 
  group_by(nsamples, vartype) %>% 
  summarise(mean = mean(variance))

meanvars %>% 
  spread(vartype, mean) %>% 
  kable() 

```

Despite the small sample size the sample variance is on average a good estimate of the actual variance ($\sigma^2 = 15^2 = 225$).

When we plot the distribution it is clear that the individual estimates of the standard deviation have a large distribution which is highly skewed (especially as smalle sample sizes). The sample variance corrects well (on average) for the underestimation.


```{r}
plot_variance_histogram <- function(n) {
  means <- meanvars %>% 
    filter(nsamples == n)
  
  vars %>%
    filter(nsamples == n) %>% 
    ggplot(aes(variance)) + 
      geom_histogram(binwidth = 25) +
      facet_grid(vartype~.) +
      scale_x_continuous(breaks = seq(-200, 1400, 200) ) +
      geom_vline(data = means %>% filter(nsamples == n), 
                 aes(xintercept = mean), color= "red", size = 1.1) +
      geom_vline(xintercept = 225, color= "blue") +
      labs(title = paste0("simulation of estimating variance (nsamples = ", n, ")"),
           subtitle= "comparing sample (n-1) and population (n) variance")
}

plot_variance_histogram(n= 5)
```

```{r}
plot_variance_histogram(n= 60)
```

```{r}
plot_variance_histogram(n= 2)
```

The sample standard deviation is however still systematially underestimated compared to the actual $\sigma$ = 15. 


```{r}
stddevs <- stats %>%
  select(nsamples, rep, sigma_est, s_est) %>%
  rename(population = sigma_est,
         sample     = s_est) %>%
  gather("vartype", "stddev", 3:4) 

meansds <- stddevs %>% 
  group_by(nsamples, vartype) %>% 
  summarise(mean = mean(stddev))

meansds %>% 
  kable()
```


```{r}
n = 100
means <- means %>%
  filter(nsamples == n)

stddevs %>%
    filter(nsamples == n) %>% 
    ggplot(aes(stddev)) + 
      geom_histogram(binwidth = 0.5) +
      facet_grid(vartype~.) +
      scale_x_continuous(breaks = seq(10, 20, 1) ) +
      geom_vline(data = means %>% filter(nsamples == n), 
                 aes(xintercept = mean), color= "red", size = 1.1) +
      geom_vline(xintercept = 15, color= "blue") +
      labs(title = paste0("simulation of estimating std. dev (nsamples = ", n, ")"),
           subtitle= "comparing sample (n-1) and population (n) sd")
```

```{r}
n = 100
stddevs %>%
  filter(nsamples == n) %>% 
  ggplot(aes(stddev)) + 
    geom_histogram(binwidth = 0.05) +
    facet_grid(vartype~.) +
    scale_x_continuous(breaks = seq(10, 20, 1) ) +
    coord_cartesian(xlim = c(14, 16))+
    geom_vline(data = means %>% filter(nsamples == n), 
               aes(xintercept = mean), color= "red", size = 1.1) +
    geom_vline(xintercept = 15, color= "blue") +
    labs(title = paste0("simulation of estimating std. dev (nsamples = ", n, ")"),
         subtitle= "comparing sample (n-1) and population (n) sd")
```

This demonstrates that while the estimate of the sample mean and sample variance is an unbiased estimator of the population mean ($\mu$) population variance ($\sigma$). However, sample standard deviation is not an unbiased estimator of the population standard deviation and systematically underestimates the standard deviation.

## How well does the standard error estimate the error in the population mean?

As I had all this simulation data I thought that I would have a look at the difference between the errors in the xbar confidence interval (based on the SE mean), using the z distribution and the t-distribution at different sample sizes.

```{r}
stats <- stats %>%
  mutate(SEmean = s_est / sqrt(nreps),
         ZciLL   = (xbar - 1.96 * SEmean),
         ZciUL   = (xbar + 1.96 * SEmean),
         ZmeanOK = ifelse( (100 > ZciLL) & (100 < ZciUL), 
                          yes = TRUE, no = FALSE ),
         tciLL   = (xbar - qt(p = 0.975, df= (nreps - 1)) * SEmean),
         tciUL   = (xbar + qt(p = 0.975, df= (nreps - 1)) * SEmean),
         tmeanOK = ifelse( (100 > tciLL) & (100 < tciUL), 
                          yes = TRUE, no = FALSE )  )

stats %>%
  group_by(nsamples) %>% 
  summarise(zmean = sum(ZmeanOK) / 10000,
            avgZciRange = mean(ZciUL - ZciLL), 
            tmean = sum(tmeanOK) / 10000,
            avgtciRange = mean(tciUL - tciLL) ) %>% 
  kable()

```


```{r}
stats %>%   
  ggplot(aes(x= xbar)) + 
    geom_histogram(bins = 100) +
    geom_vline(xintercept = 100, color= "red") +
    facet_grid(nsamples~., scales = "free") +
    labs(title= "Distribution of sample means",
         subtitle= "with different sample sizes")
```

