Stat 540 Seminar 2

The purpose of this document is gain an understanding of probablility distributions. This document is based on Jenny's version of the seminar tutorial.

Start by generating data:

n <- 10
B <- 4

set.seed(540)  # to obtain deterministic results
x <- matrix(rnorm(n * B), nrow = n)

# check the data
str(x)
##  num [1:10, 1:4] -1.615 0.734 2.095 0.501 1.736 ...
head(x)
##         [,1]      [,2]    [,3]    [,4]
## [1,] -1.6145 -0.008744 -0.8413 -1.2752
## [2,]  0.7340  0.108303 -1.0581  0.3885
## [3,]  2.0947  0.806000 -0.2552 -0.1950
## [4,]  0.5009 -1.504498  0.3213  1.4259
## [5,]  1.7355 -0.417628  0.7739 -0.6323
## [6,] -2.0584  0.480787 -0.1932  0.6684

Jenny's tutorial goes through other methods for generating data but this the nicest R-like way of doing it.

Give that data row and column names

rownames(x) <- sprintf("obs%02d", 1:n)
colnames(x) <- sprintf("samp%02d", 1:B)
x  #check what this looks like
##        samp01    samp02  samp03  samp04
## obs01 -1.6145 -0.008744 -0.8413 -1.2752
## obs02  0.7340  0.108303 -1.0581  0.3885
## obs03  2.0947  0.806000 -0.2552 -0.1950
## obs04  0.5009 -1.504498  0.3213  1.4259
## obs05  1.7355 -0.417628  0.7739 -0.6323
## obs06 -2.0584  0.480787 -0.1932  0.6684
## obs07 -1.0339 -0.168417 -1.7138  0.5225
## obs08  1.4914  0.760428  1.2975 -0.2879
## obs09 -0.5068  0.624589  1.0531 -0.3869
## obs10 -0.7001  0.580386  0.3417  0.3060
colMeans(x)  #fast way compute the means for all the samples
##   samp01   samp02   samp03   samp04 
##  0.06427  0.12612 -0.02743  0.05340

Exercise: Sample mean vs true mean

mean(x)  #true mean
## [1] 0.05409
mean(colMeans(x))  #sample mean
## [1] 0.05409

They seem to be identical. I will try with another seed and n, b value:

n <- 10
B <- 4
set.seed(77)  # to obtain deterministic results
x <- matrix(rnorm(n * B), nrow = n)
colMeans(x)
## [1]  0.4016  0.1712  0.1777 -0.3321
mean(x)  #true mean
## [1] 0.1046
mean(colMeans(x))  #sample mean
## [1] 0.1046

Same results again. I expected them to be so, given the equal number of elements used to compute each sample mean.

Exercise: Explore the Weak Law of Large Numbers

Note SEM mean standard error of the mean

B = 1000
# generate data
n10 <- colMeans(matrix(rnorm(10 * B), nrow = 10))
n100 <- colMeans(matrix(rnorm(100 * B), nrow = 100))
n1000 <- colMeans(matrix(rnorm(1000 * B), nrow = 1000))
n10000 <- colMeans(matrix(rnorm(10000 * B), nrow = 10000))

lawDat <- data.frame(sampSize = c(10, 100, 1000, 10000), trueSEM = 1/sqrt(c(10, 
    100, 1000, 10000)), obsSEM = c(sd(n10), sd(n100), sd(n1000), sd(n10000)), 
    sampMeanIQR = c(IQR(n10), IQR(n100), IQR(n1000), IQR(n10000)), sampleMeanMad = c(mad(n10), 
        mad(n100), mad(n1000), mad(n10000)))

lawDat
##   sampSize trueSEM  obsSEM sampMeanIQR sampleMeanMad
## 1       10 0.31623 0.31181     0.42774       0.31932
## 2      100 0.10000 0.10034     0.14023       0.10236
## 3     1000 0.03162 0.03212     0.04203       0.03120
## 4    10000 0.01000 0.01013     0.01424       0.01035

Exercize: Repeat with alternate distribution

# generate data
n10 <- colMeans(matrix(rnorm(10 * B, sd = 2), nrow = 10))
n100 <- colMeans(matrix(rnorm(100 * B, sd = 2), nrow = 100))
n1000 <- colMeans(matrix(rnorm(1000 * B, sd = 2), nrow = 1000))
n10000 <- colMeans(matrix(rnorm(10000 * B, sd = 2), nrow = 10000))

lawDat <- data.frame(sampSize = c(10, 100, 1000, 10000), trueSEM = 1/sqrt(c(10, 
    100, 1000, 10000)) * 2, obsSEM = c(sd(n10), sd(n100), sd(n1000), sd(n10000)), 
    sampMeanIQR = c(IQR(n10), IQR(n100), IQR(n1000), IQR(n10000)), sampleMeanMad = c(mad(n10), 
        mad(n100), mad(n1000), mad(n10000)))

lawDat
##   sampSize trueSEM  obsSEM sampMeanIQR sampleMeanMad
## 1       10 0.63246 0.62312     0.84635       0.62824
## 2      100 0.20000 0.20339     0.27179       0.20149
## 3     1000 0.06325 0.06283     0.08051       0.05949
## 4    10000 0.02000 0.01942     0.02591       0.01925

Generate a large sample from some normal distribution and explore probabilities with respect to relative frequencies

normSamp <- rnorm(10000)
# pick a threshold lets say 0.5 sd from the mean
pnorm(0.5)
## [1] 0.6915
length(normSamp[normSamp <= 0.5])/10000
## [1] 0.6784

Pretty close. Lets try another distribution

samp <- rnorm(10000, sd = 2)
# pick a threshold lets say 0.5 sd from the mean
pnorm(0.5, sd = 2)
## [1] 0.5987
length(samp[samp <= 0.5])/10000
## [1] 0.5963

Lets try a smaller sample size

samp <- rnorm(100, sd = 2)
# pick a threshold lets say 0.5 sd from the mean
pnorm(0.5, sd = 2)
## [1] 0.5987
length(samp[samp <= 0.5])/100
## [1] 0.64

It seems to drift away from the true results. Lets try probabilites greater than a threshold.

samp <- rnorm(10000, sd = 2)
# pick a threshold lets say 1.1 sd from the mean
pnorm(1.1, sd = 2, lower.tail = FALSE)
## [1] 0.2912
length(samp[samp > 1.1])/10000
## [1] 0.2898

Now lets try a interval between 2 thresholds.

samp <- rnorm(10000, sd = 2)
# pick a threshold lets say -0.5 and 1.1 sd from the mean
pnorm(1.1, sd = 2) - pnorm(-0.5, sd = 2)
## [1] 0.3075
length(samp[samp <= 1.1 & samp >= -0.5])/10000
## [1] 0.308

Now lets try a interval between 2 thresholds.

samp <- rnorm(10000, sd = 2)
# pick a threshold lets say -0.5 and 1.1 sd from the mean
pnorm(1.1, sd = 2) - pnorm(-0.5, sd = 2)
## [1] 0.3075
length(samp[samp <= 1.1 & samp >= -0.5])/10000
## [1] 0.3054

Recreation of Jenny's lattice plot with ggplot

library(ggplot2)
library(lattice)

B <- 1000
n <- round(10^(seq(from = 1, to = 2.5, length = 4)), 0)
names(n) <- paste0("n", n)
getSampleMeans <- function(n, B) colMeans(matrix(rnorm(n * B), nrow = n))
x <- data.frame(sapply(n, getSampleMeans, B))
jFormula <- as.formula(paste("~", paste(names(n), sep = "", collapse = " + ")))
xTallSkinny <- stack(x)
names(xTallSkinny) <- c("x", "n")
xTallSkinny$n <- factor(xTallSkinny$n, levels = colnames(x))


plot <- ggplot(xTallSkinny, aes(x, group = n, colour = n)) + geom_density()
plot <- plot + geom_point(aes(x = x, y = 0, group = n, colour = n), position = "jitter", 
    pch = 1)
plot <- plot + xlab("Sample Means")
plot

plot of chunk unnamed-chunk-13


densityplot(~x, xTallSkinny, xlab = "sample means", groups = n, auto.key = list(x = 0.9, 
    y = 0.9, corner = c(1, 1), reverse.rows = TRUE))

plot of chunk unnamed-chunk-13