In this document, I play with population versus sample mean and variance to get a deeper understanding of confidence intervals and hypothesis testing which later play an important role in regression analysis which plays an important role in machine learning/predictive analysis.
I would like to study specific models such as the six sided die, random numbers from a normal, binomial and poisson distribution.
# maximum population size
nmax <- 1000
# initialize population mean vector
popmean <- rep(0, times = nmax)
# compute population means of 6 sided die rolls in a loop
for (popsize in 1:nmax) {
# randomly generate die rolls
population <- sample(1:6, popsize, replace = TRUE)
# compute mean
popmean[popsize] <- mean(population)
}
# construct data frame of population mean data
meandf <- data.frame(1:nmax, popmean)
names(meandf) <- c("popsize", "popmean")
library(ggplot2)
# plot of population mean as a function of population size
g <- ggplot(meandf, aes(x = popsize, y = popmean)) +
geom_line(lwd = 1, colour = "black") +
xlab("Population size") +
ylab("Population mean")
# print plot
g
Now let us look at the distribution of sample means from a population. To begin, use a small population size of 10. Then, construct a histogram of mean values of permutations of samples. Before that, we briefly look at how many of each outcomes there are.
# population size - keep it small for now
popsize <- 10
# generate 10 random rolls of a 6-sided balanced die
randpop <- sample(1:6, popsize, replace = TRUE)
# construct data frame of roll # vs outcome
popdf <- data.frame(1:popsize, randpop)
names(popdf) <- c("runnum", "outcome")
# histogram of population per outcome
g <- ggplot(popdf, aes(outcome)) +
geom_histogram() +
ylab("Probability times population size")
# print plot
g
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Next we consider the all the possible samples for pre-specified sample sizes (2, 4, 6, 8, 9) and the corresponding means and variances in comparison the population means and variances in a histogram.
library(gtools)
library(reshape2)
# set sample sizes of interest
sampsize <- c(2, 4, 6, 8, 9)
# Loop through various sample sizes
for(sampnum in 1:length(sampsize)) {
# generate combinations of all possible sampes of prespecified size
permsample <- combinations(n = popsize, r = sampsize[sampnum],
v=randpop, set = FALSE, repeats.allowed = FALSE)
# take mean of each sample, i.e. mean of the row of the matrix representing the samples
samplemeans <- apply(permsample, 1, mean)
# take the variance of the row of the matrix representing the samples
samplevar <- apply(permsample, 1, var)
# construct a data frame with the columns representing sample number, corresponding
# mean and variance
sampledf <- data.frame(1:length(samplemeans), samplemeans, samplevar)
names(sampledf) <- c("sampnum", "sampmean", "sampvar")
# change the data frame to long format for the sake of ggplot
samplelong <- melt(sampledf, id.vars = c("sampnum"))
# data frame containing vertical lines corresponding to (population/sample) mean and
# variance and the text describing the vertical lines.
dat.vline <- data.frame(variable = c("sampmean", "sampvar"), value = c(mean(randpop),
var(randpop)))
dat.text <- data.frame(variable = c("sampmean", "sampvar"), x = c(mean(randpop),
var(randpop)), y = c(0, 0), label = c("Population mean", "Population variance"))
# construct ggplot
g <- ggplot(samplelong, aes(value)) +
geom_histogram() +
facet_grid(.~ variable) +
geom_vline(aes(xintercept = value), data = dat.vline, colour = c("blue", "red"),
lty = c(1, 2), lwd = c(1.5, 1.5)) +
geom_text(aes(x, y, label = label), data = dat.text) +
ggtitle(paste("(6 sided die) Sample mean distribution: population size = ",
popsize, ", sample size = ", sampsize[sampnum]))
# print plot
print(g)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
popsize <- 20 # size of population
randpop <- sample(1:3, popsize, replace = TRUE) # generate 10 random die rolls
# histogram of population
popdf <- data.frame(1:popsize, randpop)
names(popdf) <- c("runnum", "outcome")
ggplot(popdf, aes(outcome)) + geom_histogram() + ylab("Probability times population size")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# set sample sizes of interest
sampsize <- c(4, 8, 12, 16, 20)
# Loop through various sample sizes
for(sampnum in 1:length(sampsize)) {
# generate combinations of all possible sampes of prespecified size
permsample <- combinations(n = popsize, r = sampsize[sampnum],
v=randpop, set = FALSE, repeats.allowed = FALSE)
# take mean of each sample, i.e. mean of the row of the matrix representing the samples
samplemeans <- apply(permsample, 1, mean)
# take the variance of the row of the matrix representing the samples
samplevar <- apply(permsample, 1, var)
# construct a data frame with the columns representing sample number, corresponding
# mean and variance
sampledf <- data.frame(1:length(samplemeans), samplemeans, samplevar)
names(sampledf) <- c("sampnum", "sampmean", "sampvar")
# change the data frame to long format for the sake of ggplot
samplelong <- melt(sampledf, id.vars = c("sampnum"))
# data frame containing vertical lines corresponding to (population/sample) mean and
# variance and the text describing the vertical lines.
dat.vline <- data.frame(variable = c("sampmean", "sampvar"), value = c(mean(randpop),
var(randpop)))
dat.text <- data.frame(variable = c("sampmean", "sampvar"), x = c(mean(randpop),
var(randpop)), y = c(0, 0), label = c("Population mean", "Population variance"))
# construct ggplot
g <- ggplot(samplelong, aes(value)) +
geom_histogram() +
facet_grid(.~ variable) +
geom_vline(aes(xintercept = value), data = dat.vline, colour = c("blue", "red"),
lty = c(1, 2), lwd = c(1.5, 1.5)) +
geom_text(aes(x, y, label = label), data = dat.text) +
ggtitle(paste("(3 sided die) Sample mean distribution: population size = ",
popsize, ", sample size = ", sampsize[sampnum]))
# print plot
print(g)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The six sided fair die has six equiprobable outcomes. Hence, the smaller population size of 10 has a lower likelihood of not having outcomes in proportion to its corresponding probability.
Hence, in the population size of 10 plots we see that the data need not necessarily look very normal. Sometimes it does, sometimes it doesn’t depending on the run.
Furthermore, for small populations and sample sizes we see a lot of variability in the data. In other words, inclusion or exclusion of a point in the sample size is more likely to have a larger effect on the mean.
I think the following statement has a mathematical proof: The population mean and the means of all possible sample means are equal. From the central limit theorem, for large population sizes (in relation to the number of possible outcomes), the distribution of sample means become more and more of a normal distribution.
For a small population size (in relation to the number of possible outcomes), the distribution of sample means is closer to Students T distribution, which has a fatter tail than the normals distribution.
Having gained some intuition into the means of the samples, let us next look at the variances.
nosim <- 10000
dat <- data.frame(
x = c(sample(1:6, nosim, replace = TRUE),
apply(matrix(sample(1:6, nosim * 2, replace = TRUE),
nosim), 1, mean),
apply(matrix(sample(1:6, nosim * 3, replace = TRUE),
nosim), 1, mean),
apply(matrix(sample(1:6, nosim * 4, replace = TRUE),
nosim), 1, mean)
),
size = factor(rep(1:4, rep(nosim, 4))))
g <- ggplot(dat, aes(x = x, fill = size)) +
geom_histogram(alpha = 0.20, binwidth = 0.25, colour = "black") +
facet_grid(.~ size)
g
dat <- data.frame(
x = c(sample(0 : 1, nosim, replace = TRUE),
apply(matrix(sample(0 : 1, nosim * 10, replace = TRUE),
nosim), 1, mean),
apply(matrix(sample(0 : 1, nosim * 20, replace = TRUE),
nosim), 1, mean),
apply(matrix(sample(0 : 1, nosim * 30, replace = TRUE),
nosim), 1, mean)
),
size = factor(rep(c(1, 10, 20, 30), rep(nosim, 4))))
g <- ggplot(dat, aes(x = x, fill = size)) + geom_histogram(alpha = .20, binwidth = 1 / 12, colour = "black");
g + facet_grid(. ~ size)