library(tidyverse)
library(ggridges)
library(boot)

The Data

# load Killarney Birds data
birds <- read_csv("KillarneyBirds.csv")

## Rows: 31 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Species
## dbl (9): Oak1, Oak2, Oak3, Yew, Sitka, Norway, Mixed, Patchy, Swampy
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## learned here that read.csv() will create a dataframe, but read_csv() will create a tibble'd dataframe
birds

## # A tibble: 31 × 10
##    Species            Oak1  Oak2  Oak3   Yew Sitka Norway Mixed Patchy Swampy
##    <chr>             <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl>
##  1 Chaffinch            35    41    31     9    14     30    10     13     15
##  2 Robin                26    35    23    20    10     30    18     12     20
##  3 BlueTit              25    19    17    10     0      3     7      8      5
##  4 Goldcrest            21    27    17    21    30     65    15     22     11
##  5 Wren                 16     8     1     5     4     20    10     14     10
##  6 CoalTit              11     9     8    14     6     11     4      6      5
##  7 SpottedFlycatcher     6     4     0     0     0      0     1      3      0
##  8 TreeCreeper           5     8     4     3     0      4     2      3      2
##  9 Blackbird             3     3     1     6     3     14     7      8      8
## 10 Siskin                3     2     2     2     7      2     3      5      4
## # ℹ 21 more rows

These data consist of the number of reproducing males of 31 different species of bird across 9 habitats in County Killarney, Ireland. Using these data, we can calculate the Simpson’s Diversity Index for each habitat and use bootstrapping to assess any differences in biodiversity between habitats.

Simpson’s Diversity Index

Simpson’s Diversity Index \(D\) is defined as: \[D = 1 - \frac {\Sigma_{i = 1} n_i (n_i - 1)} {N(N-1)}\]

where \(n\) is the number of individuals in the \(i\)th species and \(N\) is the total number of all individuals.

Bootstrapping

First, I made a function that found the SDI of all columns using a loop, then used it with boot to generate bootstrap replicates.

# create function with inputs: x (original data) and indicies (for `boot` function)
sdi_boot_fxn <- function(x, indicies) {
  # establish empty vector
  sdi_vec <- vector()
  # start loop to calculate SDI for each column
  for(i in 1:ncol(x)) {
    # pull single column
    z <- pull(x, i)
    y <- z[indicies]
    N <- sum(y)
    # calculate SDI
    D <- 1 - sum(y * (y - 1)) / (N * (N-1))
    # save to i_th spot in vector
    sdi_vec[i] <- D
  }
  # return vector of SDI values
  return(sdi_vec)
}

# use `boot` to iteratively resample the data and calculate confidence intervals
sdi_boots <- boot(data = birds[,-1], statistic = sdi_boot_fxn, R = 1e4)
sdi_boots

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = birds[, -1], statistic = sdi_boot_fxn, R = 10000)
## 
## 
## Bootstrap Statistics :
##      original       bias    std. error
## t1* 0.8852767 -0.000189095  0.02595320
## t2* 0.8738996  0.002842684  0.02839455
## t3* 0.8347812 -0.003789748  0.05442748
## t4* 0.8910759 -0.001112586  0.02881728
## t5* 0.7780180  0.006646452  0.09325051
## t6* 0.8284879  0.006864062  0.05532239
## t7* 0.8996337  0.001240696  0.02303189
## t8* 0.9179604  0.002315574  0.01589714
## t9* 0.9044444  0.001828893  0.02127255

str(sdi_boots)

## List of 11
##  $ t0       : num [1:9] 0.885 0.874 0.835 0.891 0.778 ...
##  $ t        : num [1:10000, 1:9] 0.906 0.916 0.879 0.918 0.896 ...
##  $ R        : num 10000
##  $ data     : tibble [31 × 9] (S3: tbl_df/tbl/data.frame)
##   ..$ Oak1  : num [1:31] 35 26 25 21 16 11 6 5 3 3 ...
##   ..$ Oak2  : num [1:31] 41 35 19 27 8 9 4 8 3 2 ...
##   ..$ Oak3  : num [1:31] 31 23 17 17 1 8 0 4 1 2 ...
##   ..$ Yew   : num [1:31] 9 20 10 21 5 14 0 3 6 2 ...
##   ..$ Sitka : num [1:31] 14 10 0 30 4 6 0 0 3 7 ...
##   ..$ Norway: num [1:31] 30 30 3 65 20 11 0 4 14 2 ...
##   ..$ Mixed : num [1:31] 10 18 7 15 10 4 1 2 7 3 ...
##   ..$ Patchy: num [1:31] 13 12 8 22 14 6 3 3 8 5 ...
##   ..$ Swampy: num [1:31] 15 20 5 11 10 5 0 2 8 4 ...
##  $ seed     : int [1:626] 10403 1 1145278946 -817011372 -181756841 1008869453 263005156 924352970 1051002933 152241807 ...
##  $ statistic:function (x, indicies)  
##   ..- attr(*, "srcref")= 'srcref' int [1:8] 2 17 18 1 17 1 2 18
##   .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x000002161a47fcb8> 
##  $ sim      : chr "ordinary"
##  $ call     : language boot(data = birds[, -1], statistic = sdi_boot_fxn, R = 10000)
##  $ stype    : chr "i"
##  $ strata   : num [1:31] 1 1 1 1 1 1 1 1 1 1 ...
##  $ weights  : num [1:31] 0.0323 0.0323 0.0323 0.0323 0.0323 ...
##  - attr(*, "class")= chr "boot"
##  - attr(*, "boot_type")= chr "boot"

  # create empty matrix
sdi_boots_ci <- matrix(nrow = length(sdi_boots$t0), ncol = 4) 
  # name columns
colnames(sdi_boots_ci) <- c("t0", "se", "BCa_lwr", "BCa_upr")

for(i in 1:length(sdi_boots$t0)) {
  
  # CI no. 1: standard error
  sdi_boots_ci[i,2] <- sd(sdi_boots$t[,i], na.rm = TRUE)
  # CI no. 2: adjusted bootstrap percentile (BCa)
  loop_boot <- boot.ci(sdi_boots, index = i, type = "bca")
  sdi_boots_ci[i,3:4] <- loop_boot$bca[4:5]
  
}
  # pull t0
sdi_boots_ci[,1] <- as.numeric(sdi_boots$t0)

  # arrange into dataframe
sdi_boots_df <- tibble(data.frame(names(birds[,-1]),
                                  sdi_boots_ci)) 
colnames(sdi_boots_df) <- c("Habitat", "t0", "se", "BCa_lwr", "BCa_upr")
sdi_boots_df <- sdi_boots_df|>
  mutate(Habitat = factor(Habitat, levels = unique(sdi_boots_df$Habitat)))

  # create separate df for density plots
sdi_boots_dens <- data.frame(sdi_boots$t)
colnames(sdi_boots_dens) <- names(birds[,-1])

  # add bootstrap means and index value to confidence interval df
sdi_boots_df$mean <- apply(na.omit(sdi_boots_dens), 2, mean)



sdi_boots_long <- sdi_boots_dens |>
  pivot_longer(cols = 1:9, names_to = "Habitat") |>
  na.omit() |>
  inner_join(sdi_boots_df[,1:2], by = "Habitat") |>
  arrange(t0, value) |>
  mutate(Habitat = factor(Habitat, levels = unique(Habitat))) 

sdi_boots_df <- arrange(sdi_boots_df, -t0)

sdi_mean_df <- rbind(data.frame(Habitat = sdi_boots_df$Habitat, 
                             mean = sdi_boots_df$mean,
                             y = seq(nrow(sdi_boots_df))),
                     data.frame(Habitat = sdi_boots_df$Habitat, 
                             mean = sdi_boots_df$mean,
                             y = seq(nrow(sdi_boots_df))+0.8)) 
row.names(sdi_mean_df) <- NULL

p1 <- ggplot() +
  geom_density_ridges(data = sdi_boots_long, 
                      aes(x = value, y = reorder(Habitat, -t0), 
                          fill = Habitat), alpha = 0.75, color = NA) +
  geom_errorbarh(data = sdi_boots_df,
                 aes(xmin = BCa_lwr, xmax = BCa_upr, y = Habitat, height = 0.5)) +
  geom_errorbarh(data = sdi_boots_df,
                 aes(xmin = mean-se, xmax = mean+se, y = Habitat, height = 0.5),
                 color = "blue") +
  geom_point(data = sdi_boots_df, aes(x = t0, y = Habitat), size = 2) +
  scale_fill_viridis_d() +
  labs(x = "Simpson's Diversity Index", y = "Habitat",
       title = "Bird Biodiversity in County Killarney by Habitat") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(c(0.35, 1)) +
  geom_line(data = sdi_mean_df, aes(x = mean, y = y, group = Habitat),
            color = "darkred", linetype = 2)

p1

## Picking joint bandwidth of 0.00457

## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

Figure 1: Bootstrapped bird biodiversity in County Killarney, Ireland, by tree habitat. Points show calculated Simpson’s Diversity Index from measured data; dotted red lines depict the mean bootstrapped SDI. Distributions show the bootstrap replicate values for 10,000 replicates for each habitat. Blue error bars denote standard error, while black error bars show the adjusted bootstrap percentile (BCa) 95% confidence interval.

Though initially there appeared to be a shift in biodiversity across habitats, including bootstrapped confidence intervals/error show that these habitats are not meaningfully or significantly different from one another in terms of bird diversity. We can see this from the plot even without any statistics, as the errors bars overlap quite consistently from one habitat to the next. At most, the Sitka, Norway, and Oak3 habitats might be lower in diversity compared to the Patchy habitat but the significance of that difference is questionable given that the BCa error bars still overlap.

A1 Bootstrap

Luke Ghallahorne

2025-01-15

The Data

Simpson’s Diversity Index

Bootstrapping