Bootstrapping Diversity Indicies with Infer

Load packages

library(infer)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

theme_set(theme_bw(base_size = 16) + theme(aspect.ratio = 1)) # sets a global ggplot2 theme

Generate fake data

set.seed(11)

species_levels <- paste0("sp_", 1:7)

fake_data <- tibble(
  environment = rep(c("env_a", "env_b"), each = 7),
    species = rep(species_levels, times = 2),
    n = c(
        18, 14, 12, 10, 9, 8, 0,   # env_a
        45, 9, 11, 13, 15, 17, 19   # env_b
    )
) |>
    tidyr::uncount(n)

fake_data

# A tibble: 200 × 2
   environment species
   <chr>       <chr>  
 1 env_a       sp_1   
 2 env_a       sp_1   
 3 env_a       sp_1   
 4 env_a       sp_1   
 5 env_a       sp_1   
 6 env_a       sp_1   
 7 env_a       sp_1   
 8 env_a       sp_1   
 9 env_a       sp_1   
10 env_a       sp_1   
# ℹ 190 more rows

Shannon diversity index

Shannon diversity index function for use with infer::calculate(). Copy this function into your document and run it so that you can use with later.

stat_shannon <- function(x, order, ...) {
  species <- x$species
  props <- table(species) / length(species)
  
  if (length(props) == 1) {
    return(0)
  }
  
  # Only sum over non-zero proportions to avoid log(0)
  h <- -sum(props[props > 0] * log(props[props > 0]))
  
  h
}

Calculate observed

obs_a <-
    fake_data |>
    filter(environment == "env_a") |>
    specify(response = species) |>
    calculate(stat = stat_shannon)

obs_b <-
    fake_data |>
    filter(environment == "env_b") |>
    specify(response = species) |>
    calculate(stat = stat_shannon)

Boostrap sampling distribution

boot_a <-
    fake_data |>
    filter(environment == "env_a") |>
    specify(response = species) |>
    generate(reps = 10000, type = "bootstrap") |>
    calculate(stat = stat_shannon)

boot_b <-
    fake_data |>
    filter(environment == "env_b") |>
    specify(response = species) |>
    generate(reps = 10000, type = "bootstrap") |>
    calculate(stat = stat_shannon)

boot_a |>
    visualise()

boot_b |>
    visualise()

Confidence interval

ci_a <- 
    boot_a |>
    get_ci(level = 0.95, type = "percentile")

ci_b <- 
    boot_b |>
    get_ci(level = 0.95, type = "percentile")

ci_a

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     1.62     1.78

ci_b

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     1.66     1.86

boot_a |>
    visualise() +
    shade_ci(endpoints = ci_a)

boot_b |>
    visualise() +
    shade_ci(endpoints = ci_b)

Plots

obs_shannon <-
    bind_rows(
        env_a = obs_a,
        env_b = obs_b,
        .id = "environment"
    )

ci_shannon <- 
    bind_rows(
        env_a = ci_a,
        env_b = ci_b,
        .id = "environment"
    )

obs_shannon |>
    full_join(ci_shannon, by = join_by(environment)) |>
    ggplot(aes(x = environment, y = stat)) +
    geom_point(size = 4) +
    geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2)

Simpson diversity index

Simpson diversity index function for use with infer::calculate(). Copy this function into your document and run it so that you can use with later.

stat_simpson <- function(x, order, ...) {
  species <- x$species
  props <- table(species) / length(species)
  
  # Simpson index: 1 - sum(p_i^2)
  d <- 1 - sum(props^2)
  
  d
}

Calculate observed

obs_simpson_a <-
    fake_data |>
    filter(environment == "env_a") |>
    specify(response = species) |>
    calculate(stat = stat_simpson)

obs_simpson_b <-
    fake_data |>
    filter(environment == "env_b") |>
    specify(response = species) |>
    calculate(stat = stat_simpson)

Boostrap sampling distribution

boot_simpson_a <-
    fake_data |>
    filter(environment == "env_a") |>
    specify(response = species) |>
    generate(reps = 10000, type = "bootstrap") |>
    calculate(stat = stat_simpson)

boot_simpson_b <-
    fake_data |>
    filter(environment == "env_b") |>
    specify(response = species) |>
    generate(reps = 10000, type = "bootstrap") |>
    calculate(stat = stat_simpson)

boot_simpson_a |>
    visualise()

boot_simpson_b |>
    visualise()

Confidence interval

ci_simpson_a <- 
    boot_simpson_a |>
    get_ci(level = 0.95, type = "percentile")

ci_simpson_b <- 
    boot_simpson_b |>
    get_ci(level = 0.95, type = "percentile")

ci_simpson_a

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1    0.774    0.828

ci_simpson_b

# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1    0.753    0.830

boot_simpson_a |>
    visualise() +
    shade_ci(endpoints = ci_simpson_a)

boot_simpson_b |>
    visualise() +
    shade_ci(endpoints = ci_simpson_b)

Plots

obs_simpson <-
    bind_rows(
        env_a = obs_simpson_a,
        env_b = obs_simpson_b,
        .id = "environment"
    )

ci_simpson <- 
    bind_rows(
        env_a = ci_simpson_a,
        env_b = ci_simpson_b,
        .id = "environment"
    )

obs_simpson |>
    full_join(ci_simpson, by = join_by(environment)) |>
    ggplot(aes(x = environment, y = stat)) +
    geom_point(size = 4) +
    geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2)