The Bootstrap
Bootstrapping (statistics) - Wikipedia
Web Apps (artofstat.com)
Example: Bootstrap Confidence Intervals for Library Data
Books<- read.table ("http://stat4ds.rwth-aachen.de/data/Library.dat" , header= TRUE )
head (Books,3 )
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.00 9.00 17.00 21.98 19.00 140.00
boxplot (Books$ P, xlab= "Years since publication" , horizontal= TRUE )
library (boot)
boot.results<- boot (Books$ P,function (x,b){median (x[b])}, 10000 ) #function for data x
boot.ci (boot.results) # resampling case b
Warning in boot.ci(boot.results): bootstrap variances needed for studentized
intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates
CALL :
boot.ci(boot.out = boot.results)
Intervals :
Level Normal Basic
95% (14.05, 22.87 ) (15.00, 23.00 )
Level Percentile BCa
95% (11, 19 ) (11, 18 )
Calculations and Intervals on Original Scale
library (boot)
boot.results2<- boot (Books$ P,function (x,b){sd (x[b])},100000 )
boot.ci (boot.results2)
Warning in boot.ci(boot.results2): bootstrap variances needed for studentized
intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 100000 bootstrap replicates
CALL :
boot.ci(boot.out = boot.results2)
Intervals :
Level Normal Basic
95% (15.26, 38.12 ) (15.73, 38.33 )
Level Percentile BCa
95% (13.26, 35.86 ) (16.38, 40.12 )
Calculations and Intervals on Original Scale
plot (Books$ C,Books$ P) # scatterplot of C and P shows positive trend
library (boot)
set.seed (54321 ) # ifwewantto be abletoreproduceresults
boot_corr= boot (cbind (Books$ C,Books$ P),function (x,b){cor (x[b,1 ],x[b,2 ])}, 100000 )
boot_corr
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = cbind(Books$C, Books$P), statistic = function(x,
b) {
cor(x[b, 1], x[b, 2])
}, R = 1e+05)
Bootstrap Statistics :
original bias std. error
t1* 0.8121555 -0.01595873 0.09097654
boot.ci (boot_corr,conf= 0.90 )
Warning in boot.ci(boot_corr, conf = 0.9): bootstrap variances needed for
studentized intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 100000 bootstrap replicates
CALL :
boot.ci(boot.out = boot_corr, conf = 0.9)
Intervals :
Level Normal Basic
90% ( 0.6785, 0.9778 ) ( 0.7299, 1.0130 )
Level Percentile BCa
90% ( 0.6113, 0.8944 ) ( 0.6224, 0.8973 )
Calculations and Intervals on Original Scale
hist (boot_corr$ t,xlab= "Correlation" ,breaks= "Scott" )
Bootstrap and Empirical Cumulative Distribution Function
y <- rnorm (10 , 100 , 16 )
plot (ecdf (y), xlab= "y" , ylab= "Empirical CDF" , col= "dodgerblue4" ) # ecdf = empirical cdf
lines (seq (50 , 150 , by= .1 ), pnorm (seq (50 ,150 ,by= .1 ), 100 , 16 ), col= "red4" , lwd= 2 )
Steps:
Library Data
Books <- read.table ("http://stat4ds.rwth-aachen.de/data/Library.dat" , header= TRUE )
head (Books,3 )
## without using the boot package
n <- 54 ; nboot <- 100000
Psample <- matrix (0 , nboot, n) # matrix of nboot rows and n columns
for (i in 1 : nboot) Psample[i,] <- sample (Books$ P, n, replace= TRUE )
MedianBoot <- apply (Psample, 1 , median) # finds median in each row
quantile (MedianBoot, c (0.025 , 0.975 ))
sd (MedianBoot) # standard deviation of the bootstrapped medians
SDBoot <- apply (Psample, 1 , sd)
quantile (SDBoot, c (0.025 , 0.975 )) # 95% percentile CI for standard deviation
2.5% 97.5%
13.47816 35.72565
UN Data
library (boot)
UN <- read.table ("http://stat4ds.rwth-aachen.de/data/UN.dat" , header= T)
head (UN,3 )
Nation GDP HDI GII Fertility CO2 Homicide Prison Internet
1 Algeria 12.8 0.72 0.42 2.8 3.2 0.8 162 17
2 Argentina 14.7 0.81 0.38 2.2 4.7 5.5 147 60
3 Australia 42.3 0.93 0.11 1.9 16.5 1.1 130 83
#.............. examples of functions ............................
y.mean <- function (y, i){ # y: data vector; i: indices vector
return (mean (y[i]))}
y.median <- function (y, i){ # y: data vector; i: indices vector
return (median (y[i]))}
y.var <- function (y, i){ # y: data vector; i: indices vector
return (var (y[i]))}
set.seed (54321 ) # for reproducibility of the example
b_median = boot (UN[,3 ], y.median, R= 1000 ); b_median
ORDINARY NONPARAMETRIC BOOTSTRAP
Call:
boot(data = UN[, 3], statistic = y.median, R = 1000)
Bootstrap Statistics :
original bias std. error
t1* 0.86 -0.01548 0.03484155
boot.ci (b_median, conf= 0.90 ); plot (b_median)
Warning in boot.ci(b_median, conf = 0.9): bootstrap variances needed for
studentized intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 1000 bootstrap replicates
CALL :
boot.ci(boot.out = b_median, conf = 0.9)
Intervals :
Level Normal Basic
90% ( 0.8182, 0.9328 ) ( 0.8400, 0.9450 )
Level Percentile BCa
90% ( 0.775, 0.880 ) ( 0.775, 0.880 )
Calculations and Intervals on Original Scale
## the Pearson correlation
xy.cor <- function (y, i){
v <- y[i,]
return (cor (v[,1 ], v[,2 ], method= 'p' ))
}
b_cor = boot (cbind (UN[,2 ], UN[,6 ]), xy.cor, R= 1000 )
boot.ci (b_cor, conf= 0.90 ); plot (b_cor)
Warning in boot.ci(b_cor, conf = 0.9): bootstrap variances needed for
studentized intervals
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 1000 bootstrap replicates
CALL :
boot.ci(boot.out = b_cor, conf = 0.9)
Intervals :
Level Normal Basic
90% ( 0.5359, 0.8065 ) ( 0.5462, 0.8170 )
Level Percentile BCa
90% ( 0.5319, 0.8028 ) ( 0.4879, 0.7811 )
Calculations and Intervals on Original Scale
Parametric Bootstrap
Binomial example
alpha <- 0.05
y = c (5 , 2 , 3 , 1 , 4 , 2 )
m = 10 # known size of binomial
n = length (y) # sample size (here: n=6)
thetahat = mean (y)/ m # MLE for theta=$\theta$
nboot = 1000 # number of bootstrap samples to use
# nboot parametric samples of size n:
tmpdata = rbinom (n* nboot, m, thetahat)
# bootstrap samples (organized in a matrix):
bootstrapsample = matrix (tmpdata, nrow= n, ncol= nboot)
# computation of bootstrap estimates thetahat.b:
thetahat.b = colMeans (bootstrapsample)/ m
# derivation of percentile bootstrap CI:
q.CI = quantile (thetahat.b, c (alpha/ 2 ,1 - alpha/ 2 )); q.CI
2.5% 97.5%
0.1666667 0.4000000
Gamma example
library (boot)
# ................ required functions ....................................
y.median <- function (y, i){ # y: data vector; i: indices vector
return (median (y[i]))}
gamma.rg <- function (y,p){ # function to generate random gamma variates
rgamma (length (y), shape= p[1 ], rate= p[2 ])}
#.........................................................................
y <- c (5.88 ,5.55 ,5.40 ,1.83 ,2.31 ,1.32 ,1.52 ,6.79 ,4.99 ,3.87 ,1.21 ,10.44 ,3.71 ,1.68 ,2.53 ,5.40 ,0.17 ,9.00 ,1.41 ,3.37 ,2.99 ,1.68 ,1.73 ,6.43 ,4.16 )
s <- 2 ; r <- 0.5
p <- c (s,r)
gamma_bootmed = boot (y, y.median, R= 10000 , sim = "parametric" , ran.gen= gamma.rg, mle= p)
gamma_bootmed
PARAMETRIC BOOTSTRAP
Call:
boot(data = y, statistic = y.median, R = 10000, sim = "parametric",
ran.gen = gamma.rg, mle = p)
Bootstrap Statistics :
original bias std. error
t1* 3.37 0.03333408 0.6313874
round (median (gamma_bootmed$ t),2 )
round (sd (gamma_bootmed$ t),2 )
Exercises