The purpose of this document is gain an understanding of probablility distributions. This document is based on Jenny's version of the seminar tutorial.
Start by generating data:
n <- 10
B <- 4
set.seed(540) # to obtain deterministic results
x <- matrix(rnorm(n * B), nrow = n)
# check the data
str(x)
## num [1:10, 1:4] -1.615 0.734 2.095 0.501 1.736 ...
head(x)
## [,1] [,2] [,3] [,4]
## [1,] -1.6145 -0.008744 -0.8413 -1.2752
## [2,] 0.7340 0.108303 -1.0581 0.3885
## [3,] 2.0947 0.806000 -0.2552 -0.1950
## [4,] 0.5009 -1.504498 0.3213 1.4259
## [5,] 1.7355 -0.417628 0.7739 -0.6323
## [6,] -2.0584 0.480787 -0.1932 0.6684
Jenny's tutorial goes through other methods for generating data but this the nicest R-like way of doing it.
Give that data row and column names
rownames(x) <- sprintf("obs%02d", 1:n)
colnames(x) <- sprintf("samp%02d", 1:B)
x #check what this looks like
## samp01 samp02 samp03 samp04
## obs01 -1.6145 -0.008744 -0.8413 -1.2752
## obs02 0.7340 0.108303 -1.0581 0.3885
## obs03 2.0947 0.806000 -0.2552 -0.1950
## obs04 0.5009 -1.504498 0.3213 1.4259
## obs05 1.7355 -0.417628 0.7739 -0.6323
## obs06 -2.0584 0.480787 -0.1932 0.6684
## obs07 -1.0339 -0.168417 -1.7138 0.5225
## obs08 1.4914 0.760428 1.2975 -0.2879
## obs09 -0.5068 0.624589 1.0531 -0.3869
## obs10 -0.7001 0.580386 0.3417 0.3060
colMeans(x) #fast way compute the means for all the samples
## samp01 samp02 samp03 samp04
## 0.06427 0.12612 -0.02743 0.05340
mean(x) #true mean
## [1] 0.05409
mean(colMeans(x)) #sample mean
## [1] 0.05409
They seem to be identical. I will try with another seed and n, b value:
n <- 10
B <- 4
set.seed(77) # to obtain deterministic results
x <- matrix(rnorm(n * B), nrow = n)
colMeans(x)
## [1] 0.4016 0.1712 0.1777 -0.3321
mean(x) #true mean
## [1] 0.1046
mean(colMeans(x)) #sample mean
## [1] 0.1046
Same results again. I expected them to be so, given the equal number of elements used to compute each sample mean.
Note SEM mean standard error of the mean
B = 1000
# generate data
n10 <- colMeans(matrix(rnorm(10 * B), nrow = 10))
n100 <- colMeans(matrix(rnorm(100 * B), nrow = 100))
n1000 <- colMeans(matrix(rnorm(1000 * B), nrow = 1000))
n10000 <- colMeans(matrix(rnorm(10000 * B), nrow = 10000))
lawDat <- data.frame(sampSize = c(10, 100, 1000, 10000), trueSEM = 1/sqrt(c(10,
100, 1000, 10000)), obsSEM = c(sd(n10), sd(n100), sd(n1000), sd(n10000)),
sampMeanIQR = c(IQR(n10), IQR(n100), IQR(n1000), IQR(n10000)), sampleMeanMad = c(mad(n10),
mad(n100), mad(n1000), mad(n10000)))
lawDat
## sampSize trueSEM obsSEM sampMeanIQR sampleMeanMad
## 1 10 0.31623 0.31181 0.42774 0.31932
## 2 100 0.10000 0.10034 0.14023 0.10236
## 3 1000 0.03162 0.03212 0.04203 0.03120
## 4 10000 0.01000 0.01013 0.01424 0.01035
# generate data
n10 <- colMeans(matrix(rnorm(10 * B, sd = 2), nrow = 10))
n100 <- colMeans(matrix(rnorm(100 * B, sd = 2), nrow = 100))
n1000 <- colMeans(matrix(rnorm(1000 * B, sd = 2), nrow = 1000))
n10000 <- colMeans(matrix(rnorm(10000 * B, sd = 2), nrow = 10000))
lawDat <- data.frame(sampSize = c(10, 100, 1000, 10000), trueSEM = 1/sqrt(c(10,
100, 1000, 10000)) * 2, obsSEM = c(sd(n10), sd(n100), sd(n1000), sd(n10000)),
sampMeanIQR = c(IQR(n10), IQR(n100), IQR(n1000), IQR(n10000)), sampleMeanMad = c(mad(n10),
mad(n100), mad(n1000), mad(n10000)))
lawDat
## sampSize trueSEM obsSEM sampMeanIQR sampleMeanMad
## 1 10 0.63246 0.62312 0.84635 0.62824
## 2 100 0.20000 0.20339 0.27179 0.20149
## 3 1000 0.06325 0.06283 0.08051 0.05949
## 4 10000 0.02000 0.01942 0.02591 0.01925
normSamp <- rnorm(10000)
# pick a threshold lets say 0.5 sd from the mean
pnorm(0.5)
## [1] 0.6915
length(normSamp[normSamp <= 0.5])/10000
## [1] 0.6784
Pretty close. Lets try another distribution
samp <- rnorm(10000, sd = 2)
# pick a threshold lets say 0.5 sd from the mean
pnorm(0.5, sd = 2)
## [1] 0.5987
length(samp[samp <= 0.5])/10000
## [1] 0.5963
Lets try a smaller sample size
samp <- rnorm(100, sd = 2)
# pick a threshold lets say 0.5 sd from the mean
pnorm(0.5, sd = 2)
## [1] 0.5987
length(samp[samp <= 0.5])/100
## [1] 0.64
It seems to drift away from the true results. Lets try probabilites greater than a threshold.
samp <- rnorm(10000, sd = 2)
# pick a threshold lets say 1.1 sd from the mean
pnorm(1.1, sd = 2, lower.tail = FALSE)
## [1] 0.2912
length(samp[samp > 1.1])/10000
## [1] 0.2898
Now lets try a interval between 2 thresholds.
samp <- rnorm(10000, sd = 2)
# pick a threshold lets say -0.5 and 1.1 sd from the mean
pnorm(1.1, sd = 2) - pnorm(-0.5, sd = 2)
## [1] 0.3075
length(samp[samp <= 1.1 & samp >= -0.5])/10000
## [1] 0.308
Now lets try a interval between 2 thresholds.
samp <- rnorm(10000, sd = 2)
# pick a threshold lets say -0.5 and 1.1 sd from the mean
pnorm(1.1, sd = 2) - pnorm(-0.5, sd = 2)
## [1] 0.3075
length(samp[samp <= 1.1 & samp >= -0.5])/10000
## [1] 0.3054
library(ggplot2)
library(lattice)
B <- 1000
n <- round(10^(seq(from = 1, to = 2.5, length = 4)), 0)
names(n) <- paste0("n", n)
getSampleMeans <- function(n, B) colMeans(matrix(rnorm(n * B), nrow = n))
x <- data.frame(sapply(n, getSampleMeans, B))
jFormula <- as.formula(paste("~", paste(names(n), sep = "", collapse = " + ")))
xTallSkinny <- stack(x)
names(xTallSkinny) <- c("x", "n")
xTallSkinny$n <- factor(xTallSkinny$n, levels = colnames(x))
plot <- ggplot(xTallSkinny, aes(x, group = n, colour = n)) + geom_density()
plot <- plot + geom_point(aes(x = x, y = 0, group = n, colour = n), position = "jitter",
pch = 1)
plot <- plot + xlab("Sample Means")
plot
densityplot(~x, xTallSkinny, xlab = "sample means", groups = n, auto.key = list(x = 0.9,
y = 0.9, corner = c(1, 1), reverse.rows = TRUE))