library(tidyverse)
library(ggridges)
library(boot)
# load Killarney Birds data
birds <- read_csv("KillarneyBirds.csv")
## Rows: 31 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Species
## dbl (9): Oak1, Oak2, Oak3, Yew, Sitka, Norway, Mixed, Patchy, Swampy
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## learned here that read.csv() will create a dataframe, but read_csv() will create a tibble'd dataframe
birds
## # A tibble: 31 × 10
## Species Oak1 Oak2 Oak3 Yew Sitka Norway Mixed Patchy Swampy
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Chaffinch 35 41 31 9 14 30 10 13 15
## 2 Robin 26 35 23 20 10 30 18 12 20
## 3 BlueTit 25 19 17 10 0 3 7 8 5
## 4 Goldcrest 21 27 17 21 30 65 15 22 11
## 5 Wren 16 8 1 5 4 20 10 14 10
## 6 CoalTit 11 9 8 14 6 11 4 6 5
## 7 SpottedFlycatcher 6 4 0 0 0 0 1 3 0
## 8 TreeCreeper 5 8 4 3 0 4 2 3 2
## 9 Blackbird 3 3 1 6 3 14 7 8 8
## 10 Siskin 3 2 2 2 7 2 3 5 4
## # ℹ 21 more rows
These data consist of the number of reproducing males of 31 different species of bird across 9 habitats in County Killarney, Ireland. Using these data, we can calculate the Simpson’s Diversity Index for each habitat and use bootstrapping to assess any differences in biodiversity between habitats.
Simpson’s Diversity Index \(D\) is defined as: \[D = 1 - \frac {\Sigma_{i = 1} n_i (n_i - 1)} {N(N-1)}\]
where \(n\) is the number of individuals in the \(i\)th species and \(N\) is the total number of all individuals.
First, I made a function that found the SDI of all columns using a
loop, then used it with boot to generate bootstrap
replicates.
# create function with inputs: x (original data) and indicies (for `boot` function)
sdi_boot_fxn <- function(x, indicies) {
# establish empty vector
sdi_vec <- vector()
# start loop to calculate SDI for each column
for(i in 1:ncol(x)) {
# pull single column
z <- pull(x, i)
y <- z[indicies]
N <- sum(y)
# calculate SDI
D <- 1 - sum(y * (y - 1)) / (N * (N-1))
# save to i_th spot in vector
sdi_vec[i] <- D
}
# return vector of SDI values
return(sdi_vec)
}
# use `boot` to iteratively resample the data and calculate confidence intervals
sdi_boots <- boot(data = birds[,-1], statistic = sdi_boot_fxn, R = 1e4)
sdi_boots
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = birds[, -1], statistic = sdi_boot_fxn, R = 10000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.8852767 -0.000189095 0.02595320
## t2* 0.8738996 0.002842684 0.02839455
## t3* 0.8347812 -0.003789748 0.05442748
## t4* 0.8910759 -0.001112586 0.02881728
## t5* 0.7780180 0.006646452 0.09325051
## t6* 0.8284879 0.006864062 0.05532239
## t7* 0.8996337 0.001240696 0.02303189
## t8* 0.9179604 0.002315574 0.01589714
## t9* 0.9044444 0.001828893 0.02127255
str(sdi_boots)
## List of 11
## $ t0 : num [1:9] 0.885 0.874 0.835 0.891 0.778 ...
## $ t : num [1:10000, 1:9] 0.906 0.916 0.879 0.918 0.896 ...
## $ R : num 10000
## $ data : tibble [31 × 9] (S3: tbl_df/tbl/data.frame)
## ..$ Oak1 : num [1:31] 35 26 25 21 16 11 6 5 3 3 ...
## ..$ Oak2 : num [1:31] 41 35 19 27 8 9 4 8 3 2 ...
## ..$ Oak3 : num [1:31] 31 23 17 17 1 8 0 4 1 2 ...
## ..$ Yew : num [1:31] 9 20 10 21 5 14 0 3 6 2 ...
## ..$ Sitka : num [1:31] 14 10 0 30 4 6 0 0 3 7 ...
## ..$ Norway: num [1:31] 30 30 3 65 20 11 0 4 14 2 ...
## ..$ Mixed : num [1:31] 10 18 7 15 10 4 1 2 7 3 ...
## ..$ Patchy: num [1:31] 13 12 8 22 14 6 3 3 8 5 ...
## ..$ Swampy: num [1:31] 15 20 5 11 10 5 0 2 8 4 ...
## $ seed : int [1:626] 10403 1 1145278946 -817011372 -181756841 1008869453 263005156 924352970 1051002933 152241807 ...
## $ statistic:function (x, indicies)
## ..- attr(*, "srcref")= 'srcref' int [1:8] 2 17 18 1 17 1 2 18
## .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x000002161a47fcb8>
## $ sim : chr "ordinary"
## $ call : language boot(data = birds[, -1], statistic = sdi_boot_fxn, R = 10000)
## $ stype : chr "i"
## $ strata : num [1:31] 1 1 1 1 1 1 1 1 1 1 ...
## $ weights : num [1:31] 0.0323 0.0323 0.0323 0.0323 0.0323 ...
## - attr(*, "class")= chr "boot"
## - attr(*, "boot_type")= chr "boot"
# create empty matrix
sdi_boots_ci <- matrix(nrow = length(sdi_boots$t0), ncol = 4)
# name columns
colnames(sdi_boots_ci) <- c("t0", "se", "BCa_lwr", "BCa_upr")
for(i in 1:length(sdi_boots$t0)) {
# CI no. 1: standard error
sdi_boots_ci[i,2] <- sd(sdi_boots$t[,i], na.rm = TRUE)
# CI no. 2: adjusted bootstrap percentile (BCa)
loop_boot <- boot.ci(sdi_boots, index = i, type = "bca")
sdi_boots_ci[i,3:4] <- loop_boot$bca[4:5]
}
# pull t0
sdi_boots_ci[,1] <- as.numeric(sdi_boots$t0)
# arrange into dataframe
sdi_boots_df <- tibble(data.frame(names(birds[,-1]),
sdi_boots_ci))
colnames(sdi_boots_df) <- c("Habitat", "t0", "se", "BCa_lwr", "BCa_upr")
sdi_boots_df <- sdi_boots_df|>
mutate(Habitat = factor(Habitat, levels = unique(sdi_boots_df$Habitat)))
# create separate df for density plots
sdi_boots_dens <- data.frame(sdi_boots$t)
colnames(sdi_boots_dens) <- names(birds[,-1])
# add bootstrap means and index value to confidence interval df
sdi_boots_df$mean <- apply(na.omit(sdi_boots_dens), 2, mean)
sdi_boots_long <- sdi_boots_dens |>
pivot_longer(cols = 1:9, names_to = "Habitat") |>
na.omit() |>
inner_join(sdi_boots_df[,1:2], by = "Habitat") |>
arrange(t0, value) |>
mutate(Habitat = factor(Habitat, levels = unique(Habitat)))
sdi_boots_df <- arrange(sdi_boots_df, -t0)
sdi_mean_df <- rbind(data.frame(Habitat = sdi_boots_df$Habitat,
mean = sdi_boots_df$mean,
y = seq(nrow(sdi_boots_df))),
data.frame(Habitat = sdi_boots_df$Habitat,
mean = sdi_boots_df$mean,
y = seq(nrow(sdi_boots_df))+0.8))
row.names(sdi_mean_df) <- NULL
p1 <- ggplot() +
geom_density_ridges(data = sdi_boots_long,
aes(x = value, y = reorder(Habitat, -t0),
fill = Habitat), alpha = 0.75, color = NA) +
geom_errorbarh(data = sdi_boots_df,
aes(xmin = BCa_lwr, xmax = BCa_upr, y = Habitat, height = 0.5)) +
geom_errorbarh(data = sdi_boots_df,
aes(xmin = mean-se, xmax = mean+se, y = Habitat, height = 0.5),
color = "blue") +
geom_point(data = sdi_boots_df, aes(x = t0, y = Habitat), size = 2) +
scale_fill_viridis_d() +
labs(x = "Simpson's Diversity Index", y = "Habitat",
title = "Bird Biodiversity in County Killarney by Habitat") +
theme(plot.title = element_text(hjust = 0.5)) +
xlim(c(0.35, 1)) +
geom_line(data = sdi_mean_df, aes(x = mean, y = y, group = Habitat),
color = "darkred", linetype = 2)
p1
## Picking joint bandwidth of 0.00457
## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
Figure 1: Bootstrapped bird biodiversity in County Killarney, Ireland, by tree habitat. Points show calculated Simpson’s Diversity Index from measured data; dotted red lines depict the mean bootstrapped SDI. Distributions show the bootstrap replicate values for 10,000 replicates for each habitat. Blue error bars denote standard error, while black error bars show the adjusted bootstrap percentile (BCa) 95% confidence interval.
Though initially there appeared to be a shift in biodiversity across habitats, including bootstrapped confidence intervals/error show that these habitats are not meaningfully or significantly different from one another in terms of bird diversity. We can see this from the plot even without any statistics, as the errors bars overlap quite consistently from one habitat to the next. At most, the Sitka, Norway, and Oak3 habitats might be lower in diversity compared to the Patchy habitat but the significance of that difference is questionable given that the BCa error bars still overlap.