In this document, I play with population versus sample mean and variance to get a deeper understanding of confidence intervals and hypothesis testing which later play an important role in regression analysis which plays an important role in machine learning/predictive analysis.

I would like to study specific models such as the six sided die, random numbers from a normal, binomial and poisson distribution.

The six sided die

  1. Generate 10 six sided die rolls.
  2. Compute the population mean.
  3. Compute the average of two die rolls of every pair from population.
  4. Plot the latter result in a histogram with the population mean.
  5. Vary the population and sample size.

How does the population mean change with population size?

# maximum population size
nmax <- 1000 

# initialize population mean vector
popmean <- rep(0, times = nmax) 

# compute population means of 6 sided die rolls in a loop
for (popsize in 1:nmax) {
        
        # randomly generate die rolls
        population <- sample(1:6, popsize, replace = TRUE)
        
        # compute mean
        popmean[popsize] <- mean(population) 
}

# construct data frame of population mean data 
meandf <- data.frame(1:nmax, popmean) 
names(meandf) <- c("popsize", "popmean")

library(ggplot2)

# plot of population mean as a function of population size
g <- ggplot(meandf, aes(x = popsize, y = popmean)) + 
        geom_line(lwd = 1, colour = "black") + 
        xlab("Population size") + 
        ylab("Population mean")

# print plot
g

Now let us look at the distribution of sample means from a population. To begin, use a small population size of 10. Then, construct a histogram of mean values of permutations of samples. Before that, we briefly look at how many of each outcomes there are.

# population size - keep it small for now
popsize <- 10 

# generate 10 random rolls of a 6-sided balanced die
randpop <- sample(1:6, popsize, replace = TRUE) 

# construct data frame of roll # vs outcome
popdf <- data.frame(1:popsize, randpop)
names(popdf) <- c("runnum", "outcome")

# histogram of population per outcome
g <- ggplot(popdf, aes(outcome)) + 
        geom_histogram() + 
        ylab("Probability times population size")

# print plot
g
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Next we consider the all the possible samples for pre-specified sample sizes (2, 4, 6, 8, 9) and the corresponding means and variances in comparison the population means and variances in a histogram.

library(gtools)
library(reshape2)

# set sample sizes of interest
sampsize <- c(2, 4, 6, 8, 9)

# Loop through various sample sizes
for(sampnum in 1:length(sampsize)) {
        
        # generate combinations of all possible sampes of prespecified size
        permsample <- combinations(n = popsize, r = sampsize[sampnum], 
                                   v=randpop, set = FALSE, repeats.allowed = FALSE) 
        
        # take mean of each sample, i.e. mean of the row of the matrix representing the samples
        samplemeans <- apply(permsample, 1, mean) 
        
        # take the variance of the row of the matrix representing the samples
        samplevar <- apply(permsample, 1, var) 
        
        # construct a data frame with the columns representing sample number, corresponding
        # mean and variance
        sampledf <- data.frame(1:length(samplemeans), samplemeans, samplevar)
        names(sampledf) <- c("sampnum", "sampmean", "sampvar") 
        
        # change the data frame to long format for the sake of ggplot
        samplelong <- melt(sampledf, id.vars = c("sampnum"))
        
        # data frame containing vertical lines corresponding to (population/sample) mean and
        # variance and the text describing the vertical lines.
        dat.vline <- data.frame(variable = c("sampmean", "sampvar"), value = c(mean(randpop), 
                                                                          var(randpop)))
        
        dat.text <- data.frame(variable = c("sampmean", "sampvar"), x = c(mean(randpop), 
                     var(randpop)), y = c(0, 0), label = c("Population mean", "Population variance"))
        
        # construct ggplot
        g <- ggplot(samplelong, aes(value)) + 
                geom_histogram() + 
                facet_grid(.~ variable) + 
                geom_vline(aes(xintercept = value), data = dat.vline, colour = c("blue", "red"), 
                           lty = c(1, 2), lwd = c(1.5, 1.5)) +
                geom_text(aes(x, y, label = label), data = dat.text) +
                ggtitle(paste("(6 sided die) Sample mean distribution: population size = ", 
                              popsize, ", sample size = ", sampsize[sampnum]))
        
        # print plot
        print(g)        
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

popsize <- 20 # size of population
randpop <- sample(1:3, popsize, replace = TRUE) # generate 10 random die rolls

# histogram of population
popdf <- data.frame(1:popsize, randpop)
names(popdf) <- c("runnum", "outcome")
ggplot(popdf, aes(outcome)) + geom_histogram() + ylab("Probability times population size")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# set sample sizes of interest
sampsize <- c(4, 8, 12, 16, 20)

# Loop through various sample sizes
for(sampnum in 1:length(sampsize)) {
        
        # generate combinations of all possible sampes of prespecified size
        permsample <- combinations(n = popsize, r = sampsize[sampnum], 
                                   v=randpop, set = FALSE, repeats.allowed = FALSE) 
        
        # take mean of each sample, i.e. mean of the row of the matrix representing the samples
        samplemeans <- apply(permsample, 1, mean) 
        
        # take the variance of the row of the matrix representing the samples
        samplevar <- apply(permsample, 1, var) 
        
        # construct a data frame with the columns representing sample number, corresponding
        # mean and variance
        sampledf <- data.frame(1:length(samplemeans), samplemeans, samplevar)
        names(sampledf) <- c("sampnum", "sampmean", "sampvar") 
        
        # change the data frame to long format for the sake of ggplot
        samplelong <- melt(sampledf, id.vars = c("sampnum"))
        
        # data frame containing vertical lines corresponding to (population/sample) mean and
        # variance and the text describing the vertical lines.
        dat.vline <- data.frame(variable = c("sampmean", "sampvar"), value = c(mean(randpop), 
                                                                          var(randpop)))
        
        dat.text <- data.frame(variable = c("sampmean", "sampvar"), x = c(mean(randpop), 
                     var(randpop)), y = c(0, 0), label = c("Population mean", "Population variance"))
        
        # construct ggplot
        g <- ggplot(samplelong, aes(value)) + 
                geom_histogram() + 
                facet_grid(.~ variable) + 
                geom_vline(aes(xintercept = value), data = dat.vline, colour = c("blue", "red"), 
                           lty = c(1, 2), lwd = c(1.5, 1.5)) +
                geom_text(aes(x, y, label = label), data = dat.text) +
                ggtitle(paste("(3 sided die) Sample mean distribution: population size = ", 
                              popsize, ", sample size = ", sampsize[sampnum]))
        
        # print plot
        print(g)        
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Summary of plots and results

  • The six sided fair die has six equiprobable outcomes. Hence, the smaller population size of 10 has a lower likelihood of not having outcomes in proportion to its corresponding probability.

  • Hence, in the population size of 10 plots we see that the data need not necessarily look very normal. Sometimes it does, sometimes it doesn’t depending on the run.

  • Furthermore, for small populations and sample sizes we see a lot of variability in the data. In other words, inclusion or exclusion of a point in the sample size is more likely to have a larger effect on the mean.

  • I think the following statement has a mathematical proof: The population mean and the means of all possible sample means are equal. From the central limit theorem, for large population sizes (in relation to the number of possible outcomes), the distribution of sample means become more and more of a normal distribution.

  • For a small population size (in relation to the number of possible outcomes), the distribution of sample means is closer to Students T distribution, which has a fatter tail than the normals distribution.

Having gained some intuition into the means of the samples, let us next look at the variances.

In order to learn other ways of studying this, here I note down Brain Caffo’s code for simulating x die rolls

nosim <- 10000
dat <- data.frame(
        x = c(sample(1:6, nosim, replace = TRUE),
              apply(matrix(sample(1:6, nosim * 2, replace = TRUE),
                           nosim), 1, mean),
              apply(matrix(sample(1:6, nosim * 3, replace = TRUE),
                           nosim), 1, mean),
              apply(matrix(sample(1:6, nosim * 4, replace = TRUE),
                           nosim), 1, mean)
              ),
        size = factor(rep(1:4, rep(nosim, 4))))
g <- ggplot(dat, aes(x = x, fill = size)) +
        geom_histogram(alpha = 0.20, binwidth = 0.25, colour = "black") +
        facet_grid(.~ size)
g

dat <- data.frame(
  x = c(sample(0 : 1, nosim, replace = TRUE),
        apply(matrix(sample(0 : 1, nosim * 10, replace = TRUE), 
                     nosim), 1, mean),
        apply(matrix(sample(0 : 1, nosim * 20, replace = TRUE), 
                     nosim), 1, mean),
        apply(matrix(sample(0 : 1, nosim * 30, replace = TRUE), 
                     nosim), 1, mean)
        ),
  size = factor(rep(c(1, 10, 20, 30), rep(nosim, 4))))
g <- ggplot(dat, aes(x = x, fill = size)) + geom_histogram(alpha = .20, binwidth = 1 / 12, colour = "black"); 
g + facet_grid(. ~ size)