Bootstrapping and Confidence Intervals

Author

Justin Baumann

Bootstrapping (and eventually confidence intervals)

Resources

Load Packages

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.3.0      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

library(palmerpenguins)
library(data.table)


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last

The following object is masked from 'package:purrr':

    transpose

library(performance)
library(patchwork)
library(rsample) #for lm bootstraps
library(car) #to check collinearity

Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

library(skimr)

Get Penguins data!

penguins<-palmerpenguins::penguins

Fit a simple LM and have a look at results

simple_mod<-lm(bill_depth_mm ~ bill_length_mm*species*sex, data=penguins)

summary(simple_mod)


Call:
lm(formula = bill_depth_mm ~ bill_length_mm * species * sex, 
    data = penguins)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.06730 -0.52452 -0.06471  0.45593  2.90319 

Coefficients:
                                         Estimate Std. Error t value Pr(>|t|)
(Intercept)                              14.84023    1.77185   8.376 1.73e-15
bill_length_mm                            0.07466    0.04749   1.572   0.1169
speciesChinstrap                         -0.25160    2.77579  -0.091   0.9278
speciesGentoo                            -5.76780    2.98938  -1.929   0.0546
sexmale                                   4.92359    2.46355   1.999   0.0465
bill_length_mm:speciesChinstrap          -0.01026    0.06596  -0.155   0.8765
bill_length_mm:speciesGentoo              0.03871    0.07101   0.545   0.5860
bill_length_mm:sexmale                   -0.09177    0.06360  -1.443   0.1500
speciesChinstrap:sexmale                -11.35403    5.67926  -1.999   0.0464
speciesGentoo:sexmale                    -2.41202    3.94469  -0.611   0.5413
bill_length_mm:speciesChinstrap:sexmale   0.24451    0.12006   2.037   0.0425
bill_length_mm:speciesGentoo:sexmale      0.06197    0.09131   0.679   0.4978
                                           
(Intercept)                             ***
bill_length_mm                             
speciesChinstrap                           
speciesGentoo                           .  
sexmale                                 *  
bill_length_mm:speciesChinstrap            
bill_length_mm:speciesGentoo               
bill_length_mm:sexmale                     
speciesChinstrap:sexmale                *  
speciesGentoo:sexmale                      
bill_length_mm:speciesChinstrap:sexmale *  
bill_length_mm:speciesGentoo:sexmale       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.8175 on 321 degrees of freedom
  (11 observations deleted due to missingness)
Multiple R-squared:  0.8334,    Adjusted R-squared:  0.8277 
F-statistic: 145.9 on 11 and 321 DF,  p-value: < 2.2e-16

tidy(simple_mod)

# A tibble: 12 × 5
   term                                    estimate std.error statistic  p.value
   <chr>                                      <dbl>     <dbl>     <dbl>    <dbl>
 1 (Intercept)                              14.8       1.77      8.38   1.73e-15
 2 bill_length_mm                            0.0747    0.0475    1.57   1.17e- 1
 3 speciesChinstrap                         -0.252     2.78     -0.0906 9.28e- 1
 4 speciesGentoo                            -5.77      2.99     -1.93   5.46e- 2
 5 sexmale                                   4.92      2.46      2.00   4.65e- 2
 6 bill_length_mm:speciesChinstrap          -0.0103    0.0660   -0.155  8.77e- 1
 7 bill_length_mm:speciesGentoo              0.0387    0.0710    0.545  5.86e- 1
 8 bill_length_mm:sexmale                   -0.0918    0.0636   -1.44   1.50e- 1
 9 speciesChinstrap:sexmale                -11.4       5.68     -2.00   4.64e- 2
10 speciesGentoo:sexmale                    -2.41      3.94     -0.611  5.41e- 1
11 bill_length_mm:speciesChinstrap:sexmale   0.245     0.120     2.04   4.25e- 2
12 bill_length_mm:speciesGentoo:sexmale      0.0620    0.0913    0.679  4.98e- 1

Now, let’s bootstrap!

Bootstrapping is a resampling technique. We will discuss how it works!

set.seed(356) #any number is fine

penguins_intervals<- reg_intervals(bill_depth_mm ~ bill_length_mm*species*sex, data=penguins, 
                                   type='percentile',
                                   keep_reps=FALSE)

penguins_intervals

# A tibble: 11 × 6
   term                                   .lower .estim…¹  .upper .alpha .method
   <chr>                                   <dbl>    <dbl>   <dbl>  <dbl> <chr>  
 1 bill_length_mm                        -0.0441  7.22e-2  0.188    0.05 percen…
 2 bill_length_mm:sexmale                -0.264  -9.07e-2  0.0827   0.05 percen…
 3 bill_length_mm:speciesChinstrap       -0.138  -2.59e-3  0.135    0.05 percen…
 4 bill_length_mm:speciesChinstrap:sex…  -0.0130  2.39e-1  0.495    0.05 percen…
 5 bill_length_mm:speciesGentoo          -0.0959  4.01e-2  0.171    0.05 percen…
 6 bill_length_mm:speciesGentoo:sexmale  -0.141   5.98e-2  0.266    0.05 percen…
 7 sexmale                               -1.93    4.87e+0 11.6      0.05 percen…
 8 speciesChinstrap                      -6.21   -5.84e-1  4.73     0.05 percen…
 9 speciesChinstrap:sexmale             -23.0    -1.11e+1  0.527    0.05 percen…
10 speciesGentoo                        -11.0    -5.82e+0 -0.426    0.05 percen…
11 speciesGentoo:sexmale                -10.8    -2.30e+0  6.08     0.05 percen…
# … with abbreviated variable name ¹.estimate

#plot the results
penboots<-ggplot(data=penguins_intervals, aes(x=.estimate, y=term))+
  geom_vline(xintercept=0, linetype=2)+
  geom_errorbarh(aes(xmin=.lower, xmax=.upper),height=0.2)+
  geom_point(size=3)+
  theme_classic()
penboots

Understanding resampling and bootstrapping using tidyverse

Let’s take a sample of the penguins data

lilpen<- penguins %>%
  slice_sample(n=10, replace= FALSE) %>%
  select(species, sex, year, bill_length_mm)

lilpen

# A tibble: 10 × 4
   species   sex     year bill_length_mm
   <fct>     <fct>  <int>          <dbl>
 1 Gentoo    female  2007           48.7
 2 Adelie    male    2008           39.6
 3 Gentoo    female  2009           50.5
 4 Adelie    female  2007           40.3
 5 Gentoo    male    2008           44.4
 6 Adelie    female  2009           40.2
 7 Gentoo    female  2007           46.2
 8 Gentoo    female  2007           45.1
 9 Chinstrap female  2007           58  
10 Gentoo    male    2009           52.5

#let's turn resampling on (let's us include duplicates-- we can choose from entire dataset AGAIN when we collect a separate sample)
lilpen2<- penguins %>%
  slice_sample(n=10, replace= TRUE) %>%
  select(species, sex, year, bill_length_mm)

lilpen2 #if we run this enough times we will eventually see duplicates! This is the concept upon which bootstrapping is based

# A tibble: 10 × 4
   species   sex     year bill_length_mm
   <fct>     <fct>  <int>          <dbl>
 1 Adelie    male    2008           40.1
 2 Gentoo    female  2008           45.5
 3 Chinstrap male    2007           51.3
 4 Gentoo    female  2007           46.5
 5 Adelie    female  2008           33.1
 6 Adelie    female  2008           35.5
 7 Adelie    female  2007           39.5
 8 Chinstrap male    2007           48.5
 9 Adelie    female  2009           39.6
10 Gentoo    male    2009           46.8

Now we can scale up (working towards bootstrapping)

n<- 200

orig_sample <- penguins %>%
  slice_sample(n=n, replace=FALSE)

orig_sample

# A tibble: 200 × 8
   species   island bill_length_mm bill_depth_mm flipper_l…¹ body_…² sex    year
   <fct>     <fct>           <dbl>         <dbl>       <int>   <int> <fct> <int>
 1 Adelie    Biscoe           39.7          17.7         193    3200 fema…  2009
 2 Gentoo    Biscoe           47.5          14           212    4875 fema…  2009
 3 Chinstrap Dream            50.5          18.4         200    3400 fema…  2008
 4 Gentoo    Biscoe           46.6          14.2         210    4850 fema…  2008
 5 Chinstrap Dream            46.6          17.8         193    3800 fema…  2007
 6 Adelie    Dream            40.3          18.5         196    4350 male   2008
 7 Gentoo    Biscoe           45.1          14.4         210    4400 fema…  2008
 8 Adelie    Biscoe           35.3          18.9         187    3800 fema…  2007
 9 Gentoo    Biscoe           45.1          14.5         207    5050 fema…  2007
10 Gentoo    Biscoe           42.9          13.1         215    5000 fema…  2007
# … with 190 more rows, and abbreviated variable names ¹flipper_length_mm,
#   ²body_mass_g

#with this sample in hand we can draw a rsample of the sample size and calc mean arrival dealy

orig_sample %>%
  slice_sample(n=n, replace=TRUE) %>%
  summarize(meanbill=mean(bill_length_mm))

# A tibble: 1 × 1
  meanbill
     <dbl>
1       NA

#44.2

#compare to orignal dataset
penguins %>%
  summarize(meanbill=mean(bill_length_mm))

# A tibble: 1 × 1
  meanbill
     <dbl>
1       NA

#44.0 -- different because n=150 in the df but we sampled extra (n=200)

#by repeating this process many times we can see how much variation there is from sample to sample

pen_200_bs<- 1:1000 %>% #1000 = number of trials / resamples
  map_dfr(
    ~orig_sample %>%
      slice_sample(n=n, replace=TRUE) %>%
      summarize(meanbill=mean(bill_length_mm))) %>%
  mutate(n=n)

pen_200_bs #you will see we now have means for 1000 trials!

# A tibble: 1,000 × 2
   meanbill     n
      <dbl> <dbl>
 1     NA     200
 2     NA     200
 3     NA     200
 4     NA     200
 5     43.6   200
 6     NA     200
 7     NA     200
 8     NA     200
 9     44.0   200
10     NA     200
# … with 990 more rows

We can compare outputs to see how things change

pen_200_bs %>%
  skim(meanbill) #mean = 44, sd=0.391

Data summary
Name	Piped data
Number of rows	1000
Number of columns	2
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
meanbill	863	0.14	43.62	0.39	42.58	43.39	43.61	43.86	44.54	▁▃▇▆▂

#histo
bootplot<-ggplot(data=pen_200_bs, aes(x=meanbill))+
         geom_histogram(binwidth=0.1)
bootplot

Warning: Removed 863 rows containing non-finite values (`stat_bin()`).

#check against original df
pen_df_bs<- 1:1000 %>% #1000 = number of trials / resamples
  map_dfr(
    ~penguins %>%
      slice_sample(n=n, replace=TRUE) %>%
      summarize(meanbill=mean(bill_length_mm))) %>%
  mutate(n=n)

pen_df_bs

# A tibble: 1,000 × 2
   meanbill     n
      <dbl> <dbl>
 1     NA     200
 2     NA     200
 3     NA     200
 4     NA     200
 5     NA     200
 6     44.1   200
 7     NA     200
 8     NA     200
 9     NA     200
10     NA     200
# … with 990 more rows

pen_df_bs %>%
  skim(meanbill) #mean=44, sd=0.370

Data summary
Name	Piped data
Number of rows	1000
Number of columns	2
_______________________
Column type frequency:
numeric	1
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
meanbill	685	0.31	43.88	0.38	42.94	43.63	43.88	44.14	44.97	▂▆▇▅▁

#histo
raw<-ggplot(data=pen_df_bs, aes(x=meanbill))+
  geom_histogram(binwidth=0.1)
raw

Warning: Removed 685 rows containing non-finite values (`stat_bin()`).

#compare:
raw/bootplot

Warning: Removed 685 rows containing non-finite values (`stat_bin()`).
Removed 863 rows containing non-finite values (`stat_bin()`).

-The distribution of values we get when we build a series of bootstrap trials is called the bootstrap distribution. It is not exactly the same as the sampling distribution but for sufficiently large n is is a good approximation!

-Remember that if we have a roughly normal distribution we can get 95% CIs by using the rule of thump CI=2SE (or standard error of the mean) #the “real” value here is 1.96SE

calculating boostrapped CIs thus, could look like this

pen_200_bs<- 1:1000 %>% #1000 = number of trials / resamples
  map_dfr(
    ~orig_sample %>%
      slice_sample(n=n, replace=TRUE) %>%
      summarize(meanbill=mean(bill_length_mm))) %>%
  mutate(n=n)

calc_CIs<-pen_200_bs %>%
  summarize(meanbillboot=mean(meanbill), CI=1.96*sd(meanbill))

calc_CIs

# A tibble: 1 × 2
  meanbillboot    CI
         <dbl> <dbl>
1           NA    NA

We did it!

How to use the infer package to do CIs!

library(infer)

orig_sample

# A tibble: 200 × 8
   species   island bill_length_mm bill_depth_mm flipper_l…¹ body_…² sex    year
   <fct>     <fct>           <dbl>         <dbl>       <int>   <int> <fct> <int>
 1 Adelie    Biscoe           39.7          17.7         193    3200 fema…  2009
 2 Gentoo    Biscoe           47.5          14           212    4875 fema…  2009
 3 Chinstrap Dream            50.5          18.4         200    3400 fema…  2008
 4 Gentoo    Biscoe           46.6          14.2         210    4850 fema…  2008
 5 Chinstrap Dream            46.6          17.8         193    3800 fema…  2007
 6 Adelie    Dream            40.3          18.5         196    4350 male   2008
 7 Gentoo    Biscoe           45.1          14.4         210    4400 fema…  2008
 8 Adelie    Biscoe           35.3          18.9         187    3800 fema…  2007
 9 Gentoo    Biscoe           45.1          14.5         207    5050 fema…  2007
10 Gentoo    Biscoe           42.9          13.1         215    5000 fema…  2007
# … with 190 more rows, and abbreviated variable names ¹flipper_length_mm,
#   ²body_mass_g

#dplyr method for mean
orig_sample %>%
  summarize(stat=mean(bill_length_mm))

# A tibble: 1 × 1
   stat
  <dbl>
1    NA

#44.0
x_bar=44.0


#infer method for mean
orig_sample %>%
  specify(response = bill_length_mm) %>%
  calculate(stat='mean')

Warning: Removed 2 rows containing missing values.

Response: bill_length_mm (numeric)
# A tibble: 1 × 1
   stat
  <dbl>
1  43.6

#make bootstrap distribution 
boot_dist <-orig_sample %>%
  specify(response=bill_length_mm) %>%
  generate(reps=1000) %>%
  calculate(stat='mean')

Warning: Removed 2 rows containing missing values.

Setting `type = "bootstrap"` in `generate()`.

boot_dist

Response: bill_length_mm (numeric)
# A tibble: 1,000 × 2
   replicate  stat
       <int> <dbl>
 1         1  43.9
 2         2  43.2
 3         3  43.8
 4         4  43.0
 5         5  43.6
 6         6  43.9
 7         7  44.2
 8         8  43.9
 9         9  42.6
10        10  43.6
# … with 990 more rows

#look at the histo
visualize(boot_dist)

#percentile based CIs
percentile_ci <- boot_dist %>% 
  get_confidence_interval(level = 0.95, type = "percentile")

percentile_ci

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     42.9     44.3

#graphically....
visualize(boot_dist) + 
  shade_confidence_interval(endpoints = percentile_ci)

#CIs via standard error
se_CI<- boot_dist %>%
  get_confidence_interval(type='se', point_estimate = x_bar) #where x_bar is the original sample mean

Using `level = 0.95` to compute confidence interval.

se_CI

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     43.2     44.8

### let's see how the CI values line up: 
calc_CIs

# A tibble: 1 × 2
  meanbillboot    CI
         <dbl> <dbl>
1           NA    NA

44-0.783 #43.217

[1] 43.217

44+0.783 #44.783

[1] 44.783

percentile_ci #43.2, 44.8

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     42.9     44.3

se_CI # 43.2, 44.8

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     43.2     44.8

#all super close!