Explorations in statistics: the bootstrap

Douglas Curran-Everett
Explorations in statistics: the bootstrap
Advan Physiol Educ 2009 33:286-292;
doi:10.1152/advan.00062.2009

#
# -- Advances_Statistics_Code_Boot.R: R code for Explorations in Statistics ----

# -- Load the boot package and set up data graphics ----------------------------
#
library ( boot )                                             #  load the boot package

par     ( mfrow = c( 1, 1 ) )                                #  1 graphic on 1 screen
par     ( las = 1 )                                          #  make axis numbers horizontal
par     ( adj = 0.5 )                                        #  center titles and labels


# -- Define population parameters, sample numbers, and plotting ranges ---------
#
PopMean        <-    0                                       #  population mean
PopSD          <-    1                                       #  population standard deviation

PlotFactor     <- 1000                                       #  scaling factor for horizontal axes
ybarPlotMin    <- PlotFactor *  -6                           #  sample mean: minimum for plotting
ybarPlotMax    <- PlotFactor *   6                           #  sample mean: maximum for plotting

Alpha          <-     0.10                                   #  critical significance level
Conf_Level     <-     1 - Alpha                              #  confidence level

nBoot          <- 10000                                      #  number of bootstrap replications
mean.boot      <- function ( x, i ) mean ( x[i] )            #  necessary for bootstrap


# -- The Bootstrap :: Table 1 --------------------------------------------------                    # -- Table 1: first line
#
Sample_1_Data  <- c ( 0.422, 1.103, 1.006, 1.034, 0.285, -0.647, 1.235, 0.912 ,1.825 )
Sample_1_Data
## [1]  0.422  1.103  1.006  1.034  0.285 -0.647  1.235  0.912  1.825

nObs           <- length ( Sample_1_Data )                   #  number of observations in the sample

# -- Generate and print 3 bootstrap samples ---------------------------------
#
Bootstrap_Data <- matrix ( nrow = nObs, ncol = 2 )           #  matrix for bootstrap data

for ( i in 1:3 )
    { TheData     <- trunc ( runif ( nObs, min = 1, max = nObs+1 ) )
      for ( j in 1:nObs )
          { Bootstrap_Data [ j, 1 ] <- TheData [ j ]
            Bootstrap_Data [ j, 2 ] <- Sample_1_Data [ TheData [ j ] ]
            }
      print ( Bootstrap_Data [, 2 ] )
      Sample_Mean <- round ( mean ( Bootstrap_Data [, 2 ] ), 3 )
      print ( Sample_Mean )
      }                                                                                             # -- Table 1: last line
## [1] 1.235 1.006 0.422 1.235 0.422 1.825 1.825 1.235 1.103
## [1] 1.145
## [1] 0.422 1.103 0.285 0.422 1.103 1.235 0.422 1.103 1.034
## [1] 0.792
## [1] 1.103 1.103 0.912 1.235 1.034 1.103 0.285 1.825 1.034
## [1] 1.07


# -- The Bootstrap :: Figure 1 -------------------------------------------------                    # -- Figure 1: first line
#
Sample_1_Bootstrap <- boot( Sample_1_Data, mean.boot, nBoot )

# -- Calculate percentiles of bootstrap distribution ---------------------------
#
L99 <- round ( quantile ( Sample_1_Bootstrap$t, probs = c ( 0.005 ) ), 3 )
L95 <- round ( quantile ( Sample_1_Bootstrap$t, probs = c ( 0.025 ) ), 3 )
L90 <- round ( quantile ( Sample_1_Bootstrap$t, probs = c ( 0.05  ) ), 3 )
U90 <- round ( quantile ( Sample_1_Bootstrap$t, probs = c ( 0.95  ) ), 3 )
U95 <- round ( quantile ( Sample_1_Bootstrap$t, probs = c ( 0.975 ) ), 3 )
U99 <- round ( quantile ( Sample_1_Bootstrap$t, probs = c ( 0.995 ) ), 3 )

hist ( Sample_1_Bootstrap$t,                                 #  plot bootstrap distribution
       axes =  FALSE,
       xlab = "Bootstrap sample mean",
       ylab = " ",
       main = paste ( "Distribution of", nBoot, "bootstrap means" ),
       )
axis ( 1 )
#
segments ( L99, 0, L99, nBoot, col = "gray50" )              #  99th percentiles
segments ( U99, 0, U99, nBoot, col = "gray50" )
segments ( L95, 0, L95, nBoot, col = "gray50" )              #  95th percentiles
segments ( U95, 0, U95, nBoot, col = "gray50" )
segments ( L90, 0, L90, nBoot, col = "gray50" )              #  90th percentiles
segments ( U90, 0, U90, nBoot, col = "gray50" )

plot of chunk unnamed-chunk-1


Bootstrap_Example <- recordPlot ( )                          #  store data graphic                  # -- Figure 1: last line


# -- Get average and standard deviation of bootstrap replications --------------
#
print ( round ( mean ( Sample_1_Bootstrap$t ), 3 ) )         #  Ave { bootstrap means }
## [1] 0.799
print ( round ( sd   ( Sample_1_Bootstrap$t ), 3 ) )         #  SD  { bootstrap means }
## [1] 0.223


# -- The Bootstrap :: Figure 2 -------------------------------------------------                    # -- Figure 2: first line
#
par     ( mfrow = c( 2, 2 ) )                                #  4 graphics on 1 screen

nObs     <- 100                                              #  sample size

xy       <- c ( ybarPlotMin:ybarPlotMax ) / PlotFactor       #  generate theoretical distribution
N_0_1    <- dnorm  ( xy, mean = PopMean, sd = PopSD )        #  normal distribution
L_0_1    <- dlnorm ( xy, mean = PopMean, sd = PopSD )        #  log normal distribution

N_x_Min  <- -4 * PopSD                                       #  minimum x for normal distribution
N_x_Max  <-  4 * PopSD                                       #  maximum x for normal distribution
N_y_Max  <- max ( N_0_1 )                                    #  maximum y for normal distribution
L_y_Max  <- max ( L_0_1 )                                    #  maximum y for log normal distribution


# -- The Bootstrap :: Figure 2 ... normal distribution -------------------------
#
TheData <- round ( rnorm ( nObs, mean = PopMean, sd = PopSD ), 4 )

plot  ( 0, 0,                                                #  normal distribution
        axes       =  FALSE,
        frame.plot =  FALSE,
        main       = "Normal distribution",
        xlab       = " ",
        ylab       = " ",
        xlim       =  c ( N_x_Min, N_x_Max ),
        ylim       =  c ( 0, N_y_Max ),
        type       = "n"
        )
axis  ( 1 )
lines ( xy, N_0_1, col = "black"  )

qqnorm ( TheData,                                            #  normal distribution: normal quantile plot
         main = paste ( "Normal distribution: n = ", nObs ),
         xlab = "Theoretical quantiles",
         ylab = "Empirical quantiles",
         pch = 19,
         frame.plot = FALSE,
         )
qqline ( TheData )


# -- The Bootstrap :: Figure 2 ... nonnormal distribution ----------------------
#
LogData <- round ( rlnorm ( nObs, mean = PopMean, sd = PopSD ), 4 )

plot ( 0, 0,                                                 #  nonnormal distribution: log normal distribution
       axes       =  FALSE,
       frame.plot =  FALSE,
       main       = "Nonnormal distribution",
       xlab       = " ",
       ylab       = " ",
       xlim       =  c ( 0, N_x_Max ),
       ylim       =  c ( 0, L_y_Max ),
       type       = "n"
       )
axis  ( 1 )
lines ( xy, L_0_1, col = "black"  )

qqnorm ( LogData,                                            #  nonnormal distribution: normal quantile plot
         main = paste ( "Nonnormal distribution: n = ", nObs ),
         xlab = "Theoretical quantiles",
         ylab = "Empirical quantiles",
         pch = 19,
         frame.plot = FALSE,
         )
qqline ( LogData )

plot of chunk unnamed-chunk-1


Normal_Quantile_Plots <- recordPlot ( )                      #  store data graphic                  # -- Figure 2: last line


# -- The Bootstrap :: Figure 3 -------------------------------------------------                    # -- Figure 3: first line
#
par     ( mfrow = c( 1, 1 ) )                                #  1 graphic on 1 screen

qqnorm ( Sample_1_Bootstrap$t,
         main = paste ( "Normal quantile plot: sample 1" ),
         xlab = "Theoretical quantiles",
         ylab = "Empirical quantiles",
         frame.plot = FALSE,
         )
qqline ( Sample_1_Bootstrap$t )

plot of chunk unnamed-chunk-1


Sample_1_Normal_Quantile_Plot <- recordPlot ( )              #  store data graphic                  # -- Figure 3: last line


# -- The Bootstrap :: Figure 4 -------------------------------------------------                    # -- Figure 4: first line
#
Sample_1000_Data <- c ( 0.560, -1.138, 0.485, -0.864, -0.277, 2.198, 0.050, 0.500, 0.587 )
Sample_1000_Data
## [1]  0.560 -1.138  0.485 -0.864 -0.277  2.198  0.050  0.500  0.587

Sample_1000_Bootstrap <- boot( Sample_1000_Data, mean.boot, nBoot )

qqnorm ( Sample_1000_Bootstrap$t,
         main = paste ( "Normal quantile plot: sample 1000" ),
         xlab = "Theoretical quantiles",
         ylab = "Empirical quantiles",
         frame.plot = FALSE,
         )
qqline ( Sample_1000_Bootstrap$t )

plot of chunk unnamed-chunk-1


Sample_1000_Normal_Quantile_Plot <- recordPlot ( )           #  store data graphic                  # -- Figure 4: last line


# -- The Bootstrap :: Confidence intervals: sample 1 ---------------------------
#
boot.ci ( Sample_1_Bootstrap,
          conf = Conf_Level,
          type = c ( "norm", "perc", "bca" )
          )
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = Sample_1_Bootstrap, conf = Conf_Level, type = c("norm", 
##     "perc", "bca"))
## 
## Intervals : 
## Level      Normal             Percentile            BCa          
## 90%   ( 0.4291,  1.1623 )   ( 0.4066,  1.1451 )   ( 0.3444,  1.1032 )  
## Calculations and Intervals on Original Scale


# -- The Bootstrap in Data Transformation --------------------------------------
#
par     ( mfrow = c( 2, 2 ) )                                #  4 graphics on 1 screen


# -- Actual CRP data and bootstrap of sample mean ------------------------------
#
Temp_CRP_Data <- c (  0,    3.9,  5.64, 8.22, 0,     5.62,  3.92, 6.81, 30.61,  0,
                     73.2,  0,   46.7,  0,    0,    26.41, 22.82, 0,     0,     3.49,
                      0,    0,    4.81, 9.57, 5.36,  0,     5.66, 0,    59.76, 12.38,
                     15.74, 0,    0,    0,    0,     9.37, 20.78, 7.1,   7.89,  5.53
                     )

CRP_Obs          <- length ( Temp_CRP_Data )
CRP_Data         <- matrix ( nrow = CRP_Obs, ncol = 2 )
CRP_Data [ , 1 ] <- Temp_CRP_Data
#
CRP_Bootstrap_Data <- boot( CRP_Data [ , 1 ], mean.boot, nBoot )


# -- Log-transformed CRP data and bootstrap of sample mean ---------------------
#
CRP_Data [ , 2 ] <- log ( CRP_Data [ , 1 ] + 1 )             #  ln ( CRP + 1 )
CRP_Data
##        [,1]  [,2]
##  [1,]  0.00 0.000
##  [2,]  3.90 1.589
##  [3,]  5.64 1.893
##  [4,]  8.22 2.221
##  [5,]  0.00 0.000
##  [6,]  5.62 1.890
##  [7,]  3.92 1.593
##  [8,]  6.81 2.055
##  [9,] 30.61 3.453
## [10,]  0.00 0.000
## [11,] 73.20 4.307
## [12,]  0.00 0.000
## [13,] 46.70 3.865
## [14,]  0.00 0.000
## [15,]  0.00 0.000
## [16,] 26.41 3.311
## [17,] 22.82 3.171
## [18,]  0.00 0.000
## [19,]  0.00 0.000
## [20,]  3.49 1.502
## [21,]  0.00 0.000
## [22,]  0.00 0.000
## [23,]  4.81 1.760
## [24,]  9.57 2.358
## [25,]  5.36 1.850
## [26,]  0.00 0.000
## [27,]  5.66 1.896
## [28,]  0.00 0.000
## [29,] 59.76 4.107
## [30,] 12.38 2.594
## [31,] 15.74 2.818
## [32,]  0.00 0.000
## [33,]  0.00 0.000
## [34,]  0.00 0.000
## [35,]  0.00 0.000
## [36,]  9.37 2.339
## [37,] 20.78 3.081
## [38,]  7.10 2.092
## [39,]  7.89 2.185
## [40,]  5.53 1.876
#
log_CRP_Bootstrap_Data <- boot( CRP_Data [ , 2 ], mean.boot, nBoot )


# -- The Bootstrap in Data Transformation :: Figure 6                       ---                     # -- Figure 6: first line
#
hist   ( CRP_Data [ , 1 ],                                   #  Actual CRP data
         axes =  FALSE,
         xlab = " ",
         ylab = " ",
         main = paste ( "Actual: n = ", CRP_Obs )
         )
axis ( 1 )
#
qqnorm ( CRP_Data [ , 1 ], main = "Normal quantile plot", pch = 19, frame.plot = FALSE )
qqline ( CRP_Data [ , 1 ] )

hist   ( CRP_Data [ , 2 ],                                   #  Transformed CRP data
         axes =  FALSE,
         xlab = " ",
         ylab = " ",
         main = paste ( "Transformed: n = ", CRP_Obs )
         )
axis ( 1 )
#
qqnorm ( CRP_Data [ , 2 ], main = "Normal quantile plot", pch = 19, frame.plot = FALSE  )
qqline ( CRP_Data [ , 2 ] )

plot of chunk unnamed-chunk-1


Actual_Log_Data_Graphic <- recordPlot ( )                    #  store data graphic                  # -- Figure 6: last line


# -- The Bootstrap in Data Transformation :: Figure 7                       ---                     # -- Figure 7: first line
#
hist   ( CRP_Bootstrap_Data$t,                               #  Actual CRP data
         axes =  FALSE,
         xlab = " ",
         ylab = " ",
         main = paste ( "Actual:", nBoot, "bootstrap means" )
         )
axis ( 1 )
#
qqnorm ( CRP_Bootstrap_Data$t, main = "Normal quantile plot", frame.plot = FALSE  )
qqline ( CRP_Bootstrap_Data$t )

hist   ( log_CRP_Bootstrap_Data$t,                           #  Transformed CRP data
         axes =  FALSE,
         xlab = " ",
         ylab = " ",
         main = paste ( "Transformed:", nBoot, "bootstrap means" )
         )
axis ( 1 )
#
qqnorm ( log_CRP_Bootstrap_Data$t, main = "Normal quantile plot", frame.plot = FALSE  )
qqline ( log_CRP_Bootstrap_Data$t )

plot of chunk unnamed-chunk-1


Actual_Log_Bootstrap_Graphic <- recordPlot ( )               #  store data graphic                  # Figure 7: last line


# -- Replay each data graphic --------------------------------------------------
#
replayPlot ( Bootstrap_Example                )              #  Figure 1

plot of chunk unnamed-chunk-1

replayPlot ( Normal_Quantile_Plots            )              #  Figure 2

plot of chunk unnamed-chunk-1

replayPlot ( Sample_1_Normal_Quantile_Plot    )              #  Figure 3

plot of chunk unnamed-chunk-1

replayPlot ( Sample_1000_Normal_Quantile_Plot )              #  Figure 4

plot of chunk unnamed-chunk-1


replayPlot ( Actual_Log_Data_Graphic          )              #  Figure 6

plot of chunk unnamed-chunk-1

replayPlot ( Actual_Log_Bootstrap_Graphic     )              #  Figure 7

plot of chunk unnamed-chunk-1