# Install and load required R packages
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Import and inspect the sites.
cedar_bluffs <- read_excel("/Users/giannaselleck/Documents/RStudio/FinalProject/Transect_Data_Set-2/temp.namerica/CEDARBLU.xls")
## New names:
## • `voucher` -> `voucher...5`
## • `voucher` -> `voucher...6`
## • `voucher` -> `voucher...7`
## • `voucher` -> `voucher...8`
## • `voucher` -> `voucher...9`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `Stemdbh` -> `Stemdbh...15`
## • `Stemdbh` -> `Stemdbh...16`
## • `Stemdbh` -> `Stemdbh...17`
## • `Stemdbh` -> `Stemdbh...18`
## • `Stemdbh` -> `Stemdbh...19`
## • `Stemdbh` -> `Stemdbh...20`
## • `Stemdbh` -> `Stemdbh...21`
## • `Stemdbh` -> `Stemdbh...22`
## • `Stemdbh` -> `Stemdbh...23`
## • `Stemdbh` -> `Stemdbh...24`
## • `Stemdbh` -> `Stemdbh...25`
## • `Stemdbh` -> `Stemdbh...26`
## • `Stemdbh` -> `Stemdbh...27`
## • `Stemdbh` -> `Stemdbh...28`
## • `Stemdbh` -> `Stemdbh...29`
la_selva <- read_excel("/Users/giannaselleck/Documents/RStudio/FinalProject/Transect_Data_Set-2/mesoamerica/LASELVA.xls")
## New names:
## • `voucher` -> `voucher...9`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `STEMDBH` -> `STEMDBH...15`
## • `STEMDBH` -> `STEMDBH...16`
## • `STEMDBH` -> `STEMDBH...17`
## • `STEMDBH` -> `STEMDBH...18`
## • `STEMDBH` -> `STEMDBH...19`
## • `STEMDBH` -> `STEMDBH...20`
## • `STEMDBH` -> `STEMDBH...21`
head(cedar_bluffs)
## # A tibble: 6 × 29
##   Line  Family    Genus Species  voucher...5 voucher...6 voucher...7 voucher...8
##   <chr> <chr>     <chr> <chr>    <lgl>       <lgl>       <lgl>       <lgl>      
## 1 1     ACERACEAE ACER  NEGUNDO  NA          NA          NA          NA         
## 2 1     ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## 3 10    ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## 4 2     ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## 5 3     ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## 6 4     ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## # ℹ 21 more variables: voucher...9 <lgl>, voucher...10 <lgl>,
## #   voucher...11 <lgl>, voucher...12 <lgl>, Liana <chr>, `N(Ind.)` <dbl>,
## #   Stemdbh...15 <dbl>, Stemdbh...16 <dbl>, Stemdbh...17 <dbl>,
## #   Stemdbh...18 <dbl>, Stemdbh...19 <dbl>, Stemdbh...20 <dbl>,
## #   Stemdbh...21 <dbl>, Stemdbh...22 <dbl>, Stemdbh...23 <dbl>,
## #   Stemdbh...24 <dbl>, Stemdbh...25 <dbl>, Stemdbh...26 <dbl>,
## #   Stemdbh...27 <dbl>, Stemdbh...28 <dbl>, Stemdbh...29 <dbl>
str(cedar_bluffs)
## tibble [65 × 29] (S3: tbl_df/tbl/data.frame)
##  $ Line        : chr [1:65] "1" "1" "10" "2" ...
##  $ Family      : chr [1:65] "ACERACEAE" "ACERACEAE" "ACERACEAE" "ACERACEAE" ...
##  $ Genus       : chr [1:65] "ACER" "ACER" "ACER" "ACER" ...
##  $ Species     : chr [1:65] "NEGUNDO" "SACCHARUM" "SACCHARUM" "SACCHARUM" ...
##  $ voucher...5 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...6 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...7 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...8 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...9 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...10: logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...11: logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...12: logi [1:65] NA NA NA NA NA NA ...
##  $ Liana       : chr [1:65] NA NA NA NA ...
##  $ N(Ind.)     : num [1:65] 3 8 15 7 9 7 12 10 11 10 ...
##  $ Stemdbh...15: num [1:65] 8 2.5 4 3 6 8 5 6 9 2.5 ...
##  $ Stemdbh...16: num [1:65] 4 2.5 10 13 4 13 3 9 3 2.5 ...
##  $ Stemdbh...17: num [1:65] 7 8 3 3 12 2.5 7 12 11 6 ...
##  $ Stemdbh...18: num [1:65] NA 38 4 5 15 4 10 6 2.5 5 ...
##  $ Stemdbh...19: num [1:65] NA 4 6 7 7 7 18 4 2.5 7 ...
##  $ Stemdbh...20: num [1:65] NA 27 4 35 9 5 2.5 4 7 13 ...
##  $ Stemdbh...21: num [1:65] NA 12 4 2.5 5 12 20 5 19 9 ...
##  $ Stemdbh...22: num [1:65] NA NA 8 NA 6 NA 6 11 8 8 ...
##  $ Stemdbh...23: num [1:65] NA NA 2.5 NA 8 NA 4 11 9 26 ...
##  $ Stemdbh...24: num [1:65] NA NA 8 NA NA NA 3 9 4 12 ...
##  $ Stemdbh...25: num [1:65] NA NA 7 NA NA NA 2.5 NA 3 NA ...
##  $ Stemdbh...26: num [1:65] NA NA 6 NA NA NA 9 NA NA NA ...
##  $ Stemdbh...27: num [1:65] NA NA 5 NA NA NA NA NA NA NA ...
##  $ Stemdbh...28: num [1:65] NA NA 8 NA NA NA NA NA NA NA ...
##  $ Stemdbh...29: num [1:65] NA NA 9 NA NA NA NA NA NA NA ...
head(la_selva)
## # A tibble: 6 × 21
##    Line FAMILY     GENUS SPECIES voucher1 voucher2 voucher3 voucher4 voucher...9
##   <dbl> <chr>      <chr> <chr>      <dbl>    <dbl>    <dbl>    <dbl> <lgl>      
## 1     1 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 2     2 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 3     3 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 4     4 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 5     5 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 6     6 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## # ℹ 12 more variables: voucher...10 <lgl>, voucher...11 <lgl>,
## #   voucher...12 <lgl>, LIANA <chr>, `N(IND.)` <dbl>, STEMDBH...15 <dbl>,
## #   STEMDBH...16 <dbl>, STEMDBH...17 <dbl>, STEMDBH...18 <dbl>,
## #   STEMDBH...19 <dbl>, STEMDBH...20 <dbl>, STEMDBH...21 <dbl>
str(la_selva)
## tibble [250 × 21] (S3: tbl_df/tbl/data.frame)
##  $ Line        : num [1:250] 1 2 3 4 5 6 7 8 9 10 ...
##  $ FAMILY      : chr [1:250] "ANNONACEAE" "ANNONACEAE" "ANNONACEAE" "ANNONACEAE" ...
##  $ GENUS       : chr [1:250] "ANAXAGOREA" "ANAXAGOREA" "ANAXAGOREA" "ANAXAGOREA" ...
##  $ SPECIES     : chr [1:250] "PHAEOCARPA" "PHAEOCARPA" "PHAEOCARPA" "PHAEOCARPA" ...
##  $ voucher1    : num [1:250] 78484 78484 78484 78484 78484 ...
##  $ voucher2    : num [1:250] NA NA NA NA NA NA NA NA NA NA ...
##  $ voucher3    : num [1:250] NA NA NA NA NA NA NA NA NA NA ...
##  $ voucher4    : num [1:250] NA NA NA NA NA NA NA NA NA NA ...
##  $ voucher...9 : logi [1:250] NA NA NA NA NA NA ...
##  $ voucher...10: logi [1:250] NA NA NA NA NA NA ...
##  $ voucher...11: logi [1:250] NA NA NA NA NA NA ...
##  $ voucher...12: logi [1:250] NA NA NA NA NA NA ...
##  $ LIANA       : chr [1:250] NA NA NA NA ...
##  $ N(IND.)     : num [1:250] 2 6 2 3 4 7 6 4 6 2 ...
##  $ STEMDBH...15: num [1:250] 5 3 3 4 5 7 3 2.5 3 4 ...
##  $ STEMDBH...16: num [1:250] 5 3 5 9 7 7 7 7 7 2.5 ...
##  $ STEMDBH...17: num [1:250] NA 2.5 NA 4 8 4 2.5 3 5 NA ...
##  $ STEMDBH...18: num [1:250] NA 5 NA 7 5 2.5 7 4 3 NA ...
##  $ STEMDBH...19: num [1:250] NA 4 NA NA NA 6 2.5 NA 3 NA ...
##  $ STEMDBH...20: num [1:250] NA 13 NA NA NA 9 4 NA 7 NA ...
##  $ STEMDBH...21: num [1:250] NA NA NA NA NA 7 6 NA NA NA ...
# Clean column names and rename column names for La Selva.
la_selva <- la_selva %>% janitor::clean_names()

la_selva <- la_selva %>%
  mutate(species_name = paste(genus, species))

la_selva_long <- la_selva %>%
  pivot_longer(
    cols = starts_with("stemdbh"),
    names_to = "stem",
    values_to = "dbh"
  )

la_selva_long <- la_selva_long %>%
  filter(!is.na(dbh))

la_selva_clean <- la_selva_long %>%
  select(
    transect = line,
    species = species_name,
    dbh,
    n_ind
  ) %>%
  mutate(site = "La Selva")

glimpse(la_selva_clean)
## Rows: 347
## Columns: 5
## $ transect <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6…
## $ species  <chr> "ANAXAGOREA PHAEOCARPA", "ANAXAGOREA PHAEOCARPA", "ANAXAGOREA…
## $ dbh      <dbl> 5.0, 5.0, 3.0, 3.0, 2.5, 5.0, 4.0, 13.0, 3.0, 5.0, 4.0, 9.0, …
## $ n_ind    <dbl> 2, 2, 6, 6, 6, 6, 6, 6, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 7, 7, 7…
## $ site     <chr> "La Selva", "La Selva", "La Selva", "La Selva", "La Selva", "…
# Clean column names and rename column names for Cedar Bluffs.
cedar_bluffs <- cedar_bluffs %>% janitor::clean_names()

cedar_bluffs <- cedar_bluffs %>%
  mutate(species_name = paste(genus, species))

cedar_long <- cedar_bluffs %>%
  pivot_longer(
    cols = starts_with("stemdbh"),
    names_to = "stem",
    values_to = "dbh"
  )

cedar_long <- cedar_long %>%
  filter(!is.na(dbh))

cedar_clean <- cedar_long %>%
  select(
    transect = line,
    species = species_name,
    dbh,
    n_ind
  ) %>%
  mutate(site = "Cedar Bluffs")

glimpse(cedar_clean)
## Rows: 175
## Columns: 5
## $ transect <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "10", "10",…
## $ species  <chr> "ACER NEGUNDO", "ACER NEGUNDO", "ACER NEGUNDO", "ACER SACCHAR…
## $ dbh      <dbl> 8.0, 4.0, 7.0, 2.5, 2.5, 8.0, 38.0, 4.0, 27.0, 12.0, 4.0, 10.…
## $ n_ind    <dbl> 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 15, 15, 15, 15, 15, 15, 15, 15,…
## $ site     <chr> "Cedar Bluffs", "Cedar Bluffs", "Cedar Bluffs", "Cedar Bluffs…
# Combine data from both sites and group by site, transect, and species.
la_selva_clean <- la_selva_clean %>%
  mutate(transect = as.character(transect))

cedar_clean <- cedar_clean %>%
  mutate(transect = as.character(transect))

combined_data <- bind_rows(la_selva_clean, cedar_clean)

combined_data$transect <- factor(
  combined_data$transect,
  levels = sort(unique(as.numeric(combined_data$transect)))
)

combined_data %>%
  group_by(site, transect, species) %>%
  summarise(count = n())
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by site, transect, and species.
## ℹ Output is grouped by site and transect.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(site, transect, species))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.
## # A tibble: 314 × 4
## # Groups:   site, transect [20]
##    site         transect species           count
##    <chr>        <fct>    <chr>             <int>
##  1 Cedar Bluffs 1        ACER NEGUNDO          3
##  2 Cedar Bluffs 1        ACER SACCHARUM        7
##  3 Cedar Bluffs 1        QUERCUS RUBRA         1
##  4 Cedar Bluffs 1        ULMUS RUBRA           1
##  5 Cedar Bluffs 2        ACER SACCHARUM        7
##  6 Cedar Bluffs 2        CARYA OVATA           1
##  7 Cedar Bluffs 2        CERCIS CANADENSIS     1
##  8 Cedar Bluffs 2        JUGLANS NIGRA         1
##  9 Cedar Bluffs 2        OSTRYA VIRGINIANA     3
## 10 Cedar Bluffs 2        PRUNUS SEROTINA       2
## # ℹ 304 more rows
# Calculate biodiversity metrics (Simpson’s D, 1-D, 1/D, ESimpson, Shannon H, EShannon) for each transect at both sites.
species_counts <- combined_data %>%
  group_by(site, transect, species) %>%
  summarise(n = n(), .groups = "drop")

transect_summary <- species_counts %>%
  group_by(site, transect) %>%
  mutate(
    N = sum(n),
    S = n_distinct(species),
    pi = n / N
  )

biodiversity_indices <- transect_summary %>%
  group_by(site, transect) %>%
  summarise(
    S = first(S),
    N = first(N),
    
    D = sum(pi^2), # Simpson's D
    
    simpson_1_D = 1 - D,  # Simpson's index of diversity
    
    simpson_1_over_D = 1 / D,  # Simpson's reciprocal index
    
    E_simpson = (1 / D) / S,  # Simpson's index of diversity evenness (ESimpson)
    
    shannon_H = -sum(pi * log(pi)), # Shannon diversity index (H)
    
    E_shannon = shannon_H / log(S), # Shannon evenness (EShannon)
    
    .groups = "drop"
  )

knitr::kable(
  biodiversity_indices,
  digits = 3,
  caption = "Biodiversity Metrics by Transect for La Selva and Cedar Bluffs",
  booktabs = TRUE
) %>%
  kable_styling(
    full_width = FALSE,
    position = "center"
  ) %>%
  row_spec(0, bold = TRUE) %>%
  column_spec(1:ncol(biodiversity_indices), border_right = TRUE)
Biodiversity Metrics by Transect for La Selva and Cedar Bluffs
site transect S N D simpson_1_D simpson_1_over_D E_simpson shannon_H E_shannon
Cedar Bluffs 1 4 12 0.417 0.583 2.400 0.600 1.075 0.776
Cedar Bluffs 2 8 17 0.232 0.768 4.313 0.539 1.757 0.845
Cedar Bluffs 3 6 16 0.359 0.641 2.783 0.464 1.363 0.761
Cedar Bluffs 4 7 17 0.253 0.747 3.959 0.566 1.624 0.835
Cedar Bluffs 5 6 19 0.429 0.571 2.329 0.388 1.229 0.686
Cedar Bluffs 6 5 16 0.430 0.570 2.327 0.465 1.160 0.721
Cedar Bluffs 7 6 20 0.355 0.645 2.817 0.469 1.347 0.752
Cedar Bluffs 8 6 15 0.467 0.533 2.143 0.357 1.173 0.655
Cedar Bluffs 9 7 17 0.377 0.623 2.651 0.379 1.397 0.718
Cedar Bluffs 10 10 26 0.355 0.645 2.817 0.282 1.589 0.690
La Selva 1 29 37 0.040 0.960 24.891 0.858 3.297 0.979
La Selva 2 28 39 0.057 0.943 17.483 0.624 3.139 0.942
La Selva 3 29 36 0.039 0.961 25.920 0.894 3.314 0.984
La Selva 4 23 27 0.056 0.944 17.780 0.773 3.039 0.969
La Selva 5 25 35 0.055 0.945 18.284 0.731 3.080 0.957
La Selva 6 21 38 0.082 0.918 12.237 0.583 2.784 0.915
La Selva 7 25 44 0.074 0.926 13.444 0.538 2.933 0.911
La Selva 8 19 24 0.069 0.931 14.400 0.758 2.831 0.962
La Selva 9 23 37 0.065 0.935 15.382 0.669 2.955 0.942
La Selva 10 27 30 0.040 0.960 25.000 0.926 3.263 0.990
# Set a global theme for plots
theme_set(
  theme_minimal(base_size = 14) +
    theme(
      plot.title = element_text(size = 18, face = "bold"),
      axis.title = element_text(size = 15),
      axis.text = element_text(size = 14),
      legend.title = element_text(size = 15),
      legend.text = element_text(size = 14)
    )
)
# DBH distribution of all trees by site
ggplot(combined_data, aes(x = site, y = dbh, fill = site)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(alpha = 0.25, width = 0.15, size = 1) +
  scale_y_log10() +
  labs(
    title = "Distribution of Tree DBH by Site",
    x = "Site",
    y = "DBH (cm, log scale)"
  )

# DBH distributions of dominant species by site
top_species <- combined_data %>%
  count(site, species, sort = TRUE) %>%
  group_by(site) %>%
  slice_head(n = 10) %>%
  ungroup()

filtered_data <- combined_data %>%
  semi_join(top_species, by = c("site", "species"))

ggplot(filtered_data, aes(x = species, y = dbh, fill = site)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~site, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "DBH Distributions of Dominant Tree Species by Site",
    x = "Species",
    y = "DBH (cm)"
  )

# Cumulative distribution of DBH by site
ggplot(combined_data, aes(x = dbh, color = site)) +
  stat_ecdf(size = 1.2) +
  scale_x_log10() +
  labs(
    title = "Cumulative Distribution of Tree DBH by Site",
    x = "DBH (cm, log scale)",
    y = "Cumulative Proportion of Individual Trees"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Mean DBH per transect by site 
transect_means <- combined_data %>%
  group_by(site, transect) %>%
  summarise(mean_dbh = mean(dbh), .groups = "drop")

ggplot(transect_means, aes(x = site, y = mean_dbh, fill = site)) +
  geom_boxplot() +
  geom_jitter(width = 0.1) +
  scale_y_log10() +
  labs(
    title = "Mean Transect DBH by Site",
    x = "Site",
    y = "Mean DBH per Transect (cm, log scale)"
  )

# Calculate average biodiversity metrics across transects for each site
site_summary <- biodiversity_indices %>%
  group_by(site) %>%
  summarise(
    mean_D = mean(D),
    mean_1_D = mean(simpson_1_D),
    mean_1_over_D = mean(simpson_1_over_D),
    mean_Esimpson = mean(E_simpson),
    mean_shannon_H = mean(shannon_H),
    mean_Eshannon = mean(E_shannon),
    .groups = "drop"
  )

knitr::kable(
  site_summary,
  digits = 3,
  caption = "Average Biodiversity Metrics for Each Site",
  booktabs = TRUE
) %>%
  kable_styling(
    full_width = FALSE,
    position = "center"
  ) %>%
  row_spec(0, bold = TRUE) %>%
  column_spec(1:ncol(site_summary), border_right = TRUE)
Average Biodiversity Metrics for Each Site
site mean_D mean_1_D mean_1_over_D mean_Esimpson mean_shannon_H mean_Eshannon
Cedar Bluffs 0.367 0.633 2.854 0.451 1.372 0.744
La Selva 0.058 0.942 18.482 0.735 3.064 0.955
# Shannon diversity comparison between sites
ggplot(biodiversity_indices,
       aes(x = site, y = shannon_H, fill = site)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.1, size = 2, alpha = 0.7) +
  labs(
    title = "Shannon Diversity Index by Site",
    x = "Site",
    y = "Shannon Diversity Index (H)"
  )