#### Illustration of Central Limit Theorem, Uniform distribution
# demo.clt.unif(N, n)
# draws N samples of size n from Uniform(0,1)
# and plots the N means with a normal distribution overlay
demo.clt.unif <- function(N, n) {
# draw sample in a matrix with N columns and n rows
sam <- matrix(runif(N*n, 0, 1), ncol=N);
# calculate the mean of each column
sam.mean <- colMeans(sam)
# the sd of the mean is the SEM
sam.se <- sd(sam.mean)
# calculate the true SEM given the sample size n
true.se <- sqrt((1/12)/n)
# draw a histogram of the means
hist(sam.mean, freq = FALSE, breaks = 25
, main = paste("True SEM =", round(true.se, 4)
, ", Est SEM = ", round( sam.se, 4))
, xlab = paste("n =", n))
# overlay a density curve for the sample means
points(density(sam.mean), type = "l")
# overlay a normal distribution, bold and red
x <- seq(0, 1, length = 1000)
points(x, dnorm(x, mean = 0.5, sd = true.se), type = "l", lwd = 2, col = "red")
# place a rug of points under the plot
rug(sam.mean)
}
Warning message:
In scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
EOF within quoted string
par(mfrow=c(2,2));
demo.clt.unif(10000, 1)
demo.clt.unif(10000, 2)
demo.clt.unif(10000, 6)
demo.clt.unif(10000, 12)
#### Illustration of Central Limit Theorem, Exponential distribution
# demo.clt.exp(N, n) draws N samples of size n from Exponential(1)
# and plots the N means with a normal distribution overlay
demo.clt.exp <- function(N, n) {
# draw sample in a matrix with N columns and n rows
sam <- matrix(rexp(N*n, 1), ncol=N);
# calculate the mean of each column
sam.mean <- colMeans(sam)
# the sd of the mean is the SEM
sam.se <- sd(sam.mean)
# calculate the true SEM given the sample size n
true.se <- sqrt(1/n)
# draw a histogram of the means
hist(sam.mean, freq = FALSE, breaks = 25
, main = paste("True SEM =", round(true.se, 4), ", Est SEM = ", round(sam.se, 4))
, xlab = paste("n =", n))
# overlay a density curve for the sample means
points(density(sam.mean), type = "l")
# overlay a normal distribution, bold and red
x <- seq(0, 5, length = 1000)
points(x, dnorm(x, mean = 1, sd = true.se), type = "l", lwd = 2, col = "red")
# place a rug of points under the plot
rug(sam.mean)
}
par(mfrow=c(2,2));
demo.clt.exp(10000, 1)
demo.clt.exp(10000, 6)
demo.clt.exp(10000, 30)
demo.clt.exp(10000, 100)
# sample from normal distribution
df <- data.frame(x = rnorm(100, mean = 100, sd = 15))
df$z <- scale(df$x) # by default, this performs a z-score transformation
summary(df)
x z.V1
Min. : 72.31 Min. :-1.8105739
1st Qu.: 89.45 1st Qu.:-0.6054972
Median : 96.72 Median :-0.0945894
Mean : 98.06 Mean : 0.0000000
3rd Qu.:107.72 3rd Qu.: 0.6790059
Max. :138.00 Max. : 2.8084950
## x z.V1
## Min. : 39.64 Min. :-3.446123
## 1st Qu.: 90.99 1st Qu.:-0.485300
## Median :100.00 Median : 0.033925
## Mean : 99.41 Mean : 0.000000
## 3rd Qu.:110.72 3rd Qu.: 0.652006
## Max. :132.70 Max. : 1.919736
## ggplot
library(ggplot2)
p1 <- ggplot(df, aes(x = x))
# Histogram with density instead of count on y-axis
p1 <- p1 + geom_histogram(aes(y=..density..))
p1 <- p1 + geom_density(alpha=0.1, fill="white")
p1 <- p1 + geom_rug()
p1 <- p1 + labs(title = "X ~ Normal(100, 15)")
p2 <- ggplot(df, aes(x = z))
# Histogram with density instead of count on y-axis
p2 <- p2 + geom_histogram(aes(y=..density..))
p2 <- p2 + geom_density(alpha=0.1, fill="white")
p2 <- p2 + geom_rug()
p2 <- p2 + labs(title = "Z ~ Normal(0, 1)")
library(gridExtra)
grid.arrange(p1, p2, ncol=1)
The Student’s t-distribution is a family of continuous probability distributions that arises when estimating the mean of a normally distributed population in situations where the sample size is small and population standard deviation is unknown.
#### Normal vs t-distributions with a range of degrees-of-freedom
x <- seq(-8, 8, length = 1000)
par(mfrow=c(1,1))
plot(x, dnorm(x), type = "l", lwd = 2, col = "red"
, main = "Normal (red) vs t-dist with df=1, 2, 6, 12, 30, 100")
points(x, dt(x, 1), type = "l")
points(x, dt(x, 2), type = "l")
points(x, dt(x, 6), type = "l")
points(x, dt(x, 12), type = "l")
points(x, dt(x, 30), type = "l")
points(x, dt(x,100), type = "l")
The procedure is based on the assumptions that the data are a random sample from the population of interest, and that the population frequency curve is normal. In fact, the assumptions are slightly looser than this, the population frequency curve can be anything provided the sample size is large enough that it’s reasonable to assume that the sampling distribution of the mean is normal. #### Assessing assumptions using the bootstrap
#### Visual comparison of whether sampling distribution is close to Normal via Bootstrap
# a function to compare the bootstrap sampling distribution with
# a normal distribution with mean and SEM estimated from the data
bs.one.samp.dist <- function(dat, N = 1e4) {
n <- length(dat);
# resample from data
sam <- matrix(sample(dat, size = N * n, replace = TRUE), ncol=N);
# draw a histogram of the means
sam.mean <- colMeans(sam);
# save par() settings
old.par <- par(no.readonly = TRUE)
# make smaller margins
par(mfrow=c(2,1), mar=c(3,2,2,1), oma=c(1,1,1,1))
# Histogram overlaid with kernel density curve
hist(dat, freq = FALSE, breaks = 6
, main = "Plot of data with smoothed density curve")
points(density(dat), type = "l")
rug(dat)
hist(sam.mean, freq = FALSE, breaks = 25
, main = "Bootstrap sampling distribution of the mean"
, xlab = paste("Data: n =", n
, ", mean =", signif(mean(dat), digits = 5)
, ", se =", signif(sd(dat)/sqrt(n)), digits = 5))
# overlay a density curve for the sample means
points(density(sam.mean), type = "l")
# overlay a normal distribution, bold and red
x <- seq(min(sam.mean), max(sam.mean), length = 1000)
points(x, dnorm(x, mean = mean(dat), sd = sd(dat)/sqrt(n))
, type = "l", lwd = 2, col = "red")
# place a rug of points under the plot
rug(sam.mean)
# restore par() settings
par(old.par)
}
# example data, skewed --- try others datasets to develop your intuition
x <- rgamma(10, shape = .5, scale = 20)
bs.one.samp.dist(x)
We are interested in testing H[0]: µ = 50 against H[A]: µ != 50,
#### Example: Age at First Transplant
# enter data as a vector
age <- c(54, 42, 51, 54, 49, 56, 33, 58, 54, 64, 49)
par(mfrow=c(2,1))
# Histogram overlaid with kernel density curve
hist(age, freq = FALSE, breaks = 6)
points(density(age), type = "l")
rug(age)
# violin plot
library(vioplot)
vioplot(age, horizontal=TRUE, col="gray")
# stem-and-leaf plot
# stem(age, scale=2)
# t.crit
qt(1 - 0.05/2, df = length(age) - 1)
[1] 2.228139
# defaults include: alternative = "two.sided", conf.level = 0.95
t.summary <- t.test(age, mu = 50)
t.summary
One Sample t-test
data: age
t = 0.51107, df = 10, p-value = 0.6204
alternative hypothesis: true mean is not equal to 50
95 percent confidence interval:
45.72397 56.82149
sample estimates:
mean of x
51.27273
names(t.summary)
[1] "statistic" "parameter" "p.value" "conf.int" "estimate" "null.value"
[7] "alternative" "method" "data.name"
# the assumption of normality of the sampling distribution appears reasonablly close
bs.one.samp.dist(age)
# Function ot plot t-distribution with shaded p-value
t.dist.pval <- function(t.summary) {
par(mfrow=c(1,1))
lim.extreme <- max(4, abs(t.summary$statistic) + 0.5)
lim.lower <- -lim.extreme;
lim.upper <- lim.extreme;
x.curve <- seq(lim.lower, lim.upper, length=200)
y.curve <- dt(x.curve, df = t.summary$parameter)
plot(x.curve, y.curve, type = "n"
, ylab = paste("t-dist( df =", signif(t.summary$parameter, 3), ")")
, xlab = paste("t-stat =", signif(t.summary$statistic, 5)
, ", Shaded area is p-value =", signif(t.summary$p.value, 5)))
if ((t.summary$alternative == "less")
| (t.summary$alternative == "two.sided")) {
x.pval.l <- seq(lim.lower, -abs(t.summary$statistic), length=200)
y.pval.l <- dt(x.pval.l, df = t.summary$parameter)
polygon(c(lim.lower, x.pval.l, -abs(t.summary$statistic))
, c(0, y.pval.l, 0), col="gray")
}
if ((t.summary$alternative == "greater")
| (t.summary$alternative == "two.sided")) {
x.pval.u <- seq(abs(t.summary$statistic), lim.upper, length=200)
y.pval.u <- dt(x.pval.u, df = t.summary$parameter)
polygon(c(abs(t.summary$statistic), x.pval.u, lim.upper)
, c(0, y.pval.u, 0), col="gray")
}
points(x.curve, y.curve, type = "l", lwd = 2, col = "black")
}
# for the age example
t.dist.pval(t.summary)
The power of a test is the probability of correctly rejecting H0 when it is false. Equivalently, power = 1 − Prob( failing to reject H[0] when it is false ) = 1 − Prob( Type-II error ).
#### Power calculations (that you can do on your own)
toco <- c(5.6, 2.7, 6.2, 2.9, 1.5, 4.0, 4.3, 3.0, 3.6, 2.4, 6.7, 3.8)
library(pwr)
power.t.test(n = NULL, delta = 1.00 - 0.54, sd = sd(toco)
, sig.level = 0.05, power = 0.50
, type = "one.sample", alternative = "two.sided")
One-sample t test power calculation
n = 47.4122
delta = 0.46
sd = 1.582552
sig.level = 0.05
power = 0.5
alternative = two.sided
power.t.test(n = NULL, delta = 0.75 - 0.54, sd = sd(toco)
, sig.level = 0.05, power = 0.50
, type = "one.sample", alternative = "two.sided")
One-sample t test power calculation
n = 220.0851
delta = 0.21
sd = 1.582552
sig.level = 0.05
power = 0.5
alternative = two.sided