# Install and load required R packages
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# Import and inspect the sites.
cedar_bluffs <- read_excel("/Users/giannaselleck/Documents/RStudio/FinalProject/Transect_Data_Set-2/temp.namerica/CEDARBLU.xls")
## New names:
## • `voucher` -> `voucher...5`
## • `voucher` -> `voucher...6`
## • `voucher` -> `voucher...7`
## • `voucher` -> `voucher...8`
## • `voucher` -> `voucher...9`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `Stemdbh` -> `Stemdbh...15`
## • `Stemdbh` -> `Stemdbh...16`
## • `Stemdbh` -> `Stemdbh...17`
## • `Stemdbh` -> `Stemdbh...18`
## • `Stemdbh` -> `Stemdbh...19`
## • `Stemdbh` -> `Stemdbh...20`
## • `Stemdbh` -> `Stemdbh...21`
## • `Stemdbh` -> `Stemdbh...22`
## • `Stemdbh` -> `Stemdbh...23`
## • `Stemdbh` -> `Stemdbh...24`
## • `Stemdbh` -> `Stemdbh...25`
## • `Stemdbh` -> `Stemdbh...26`
## • `Stemdbh` -> `Stemdbh...27`
## • `Stemdbh` -> `Stemdbh...28`
## • `Stemdbh` -> `Stemdbh...29`
la_selva <- read_excel("/Users/giannaselleck/Documents/RStudio/FinalProject/Transect_Data_Set-2/mesoamerica/LASELVA.xls")
## New names:
## • `voucher` -> `voucher...9`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `STEMDBH` -> `STEMDBH...15`
## • `STEMDBH` -> `STEMDBH...16`
## • `STEMDBH` -> `STEMDBH...17`
## • `STEMDBH` -> `STEMDBH...18`
## • `STEMDBH` -> `STEMDBH...19`
## • `STEMDBH` -> `STEMDBH...20`
## • `STEMDBH` -> `STEMDBH...21`
head(cedar_bluffs)
## # A tibble: 6 × 29
##   Line  Family    Genus Species  voucher...5 voucher...6 voucher...7 voucher...8
##   <chr> <chr>     <chr> <chr>    <lgl>       <lgl>       <lgl>       <lgl>      
## 1 1     ACERACEAE ACER  NEGUNDO  NA          NA          NA          NA         
## 2 1     ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## 3 10    ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## 4 2     ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## 5 3     ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## 6 4     ACERACEAE ACER  SACCHAR… NA          NA          NA          NA         
## # ℹ 21 more variables: voucher...9 <lgl>, voucher...10 <lgl>,
## #   voucher...11 <lgl>, voucher...12 <lgl>, Liana <chr>, `N(Ind.)` <dbl>,
## #   Stemdbh...15 <dbl>, Stemdbh...16 <dbl>, Stemdbh...17 <dbl>,
## #   Stemdbh...18 <dbl>, Stemdbh...19 <dbl>, Stemdbh...20 <dbl>,
## #   Stemdbh...21 <dbl>, Stemdbh...22 <dbl>, Stemdbh...23 <dbl>,
## #   Stemdbh...24 <dbl>, Stemdbh...25 <dbl>, Stemdbh...26 <dbl>,
## #   Stemdbh...27 <dbl>, Stemdbh...28 <dbl>, Stemdbh...29 <dbl>
str(cedar_bluffs)
## tibble [65 × 29] (S3: tbl_df/tbl/data.frame)
##  $ Line        : chr [1:65] "1" "1" "10" "2" ...
##  $ Family      : chr [1:65] "ACERACEAE" "ACERACEAE" "ACERACEAE" "ACERACEAE" ...
##  $ Genus       : chr [1:65] "ACER" "ACER" "ACER" "ACER" ...
##  $ Species     : chr [1:65] "NEGUNDO" "SACCHARUM" "SACCHARUM" "SACCHARUM" ...
##  $ voucher...5 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...6 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...7 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...8 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...9 : logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...10: logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...11: logi [1:65] NA NA NA NA NA NA ...
##  $ voucher...12: logi [1:65] NA NA NA NA NA NA ...
##  $ Liana       : chr [1:65] NA NA NA NA ...
##  $ N(Ind.)     : num [1:65] 3 8 15 7 9 7 12 10 11 10 ...
##  $ Stemdbh...15: num [1:65] 8 2.5 4 3 6 8 5 6 9 2.5 ...
##  $ Stemdbh...16: num [1:65] 4 2.5 10 13 4 13 3 9 3 2.5 ...
##  $ Stemdbh...17: num [1:65] 7 8 3 3 12 2.5 7 12 11 6 ...
##  $ Stemdbh...18: num [1:65] NA 38 4 5 15 4 10 6 2.5 5 ...
##  $ Stemdbh...19: num [1:65] NA 4 6 7 7 7 18 4 2.5 7 ...
##  $ Stemdbh...20: num [1:65] NA 27 4 35 9 5 2.5 4 7 13 ...
##  $ Stemdbh...21: num [1:65] NA 12 4 2.5 5 12 20 5 19 9 ...
##  $ Stemdbh...22: num [1:65] NA NA 8 NA 6 NA 6 11 8 8 ...
##  $ Stemdbh...23: num [1:65] NA NA 2.5 NA 8 NA 4 11 9 26 ...
##  $ Stemdbh...24: num [1:65] NA NA 8 NA NA NA 3 9 4 12 ...
##  $ Stemdbh...25: num [1:65] NA NA 7 NA NA NA 2.5 NA 3 NA ...
##  $ Stemdbh...26: num [1:65] NA NA 6 NA NA NA 9 NA NA NA ...
##  $ Stemdbh...27: num [1:65] NA NA 5 NA NA NA NA NA NA NA ...
##  $ Stemdbh...28: num [1:65] NA NA 8 NA NA NA NA NA NA NA ...
##  $ Stemdbh...29: num [1:65] NA NA 9 NA NA NA NA NA NA NA ...
head(la_selva)
## # A tibble: 6 × 21
##    Line FAMILY     GENUS SPECIES voucher1 voucher2 voucher3 voucher4 voucher...9
##   <dbl> <chr>      <chr> <chr>      <dbl>    <dbl>    <dbl>    <dbl> <lgl>      
## 1     1 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 2     2 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 3     3 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 4     4 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 5     5 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## 6     6 ANNONACEAE ANAX… PHAEOC…    78484       NA       NA       NA NA         
## # ℹ 12 more variables: voucher...10 <lgl>, voucher...11 <lgl>,
## #   voucher...12 <lgl>, LIANA <chr>, `N(IND.)` <dbl>, STEMDBH...15 <dbl>,
## #   STEMDBH...16 <dbl>, STEMDBH...17 <dbl>, STEMDBH...18 <dbl>,
## #   STEMDBH...19 <dbl>, STEMDBH...20 <dbl>, STEMDBH...21 <dbl>
str(la_selva)
## tibble [250 × 21] (S3: tbl_df/tbl/data.frame)
##  $ Line        : num [1:250] 1 2 3 4 5 6 7 8 9 10 ...
##  $ FAMILY      : chr [1:250] "ANNONACEAE" "ANNONACEAE" "ANNONACEAE" "ANNONACEAE" ...
##  $ GENUS       : chr [1:250] "ANAXAGOREA" "ANAXAGOREA" "ANAXAGOREA" "ANAXAGOREA" ...
##  $ SPECIES     : chr [1:250] "PHAEOCARPA" "PHAEOCARPA" "PHAEOCARPA" "PHAEOCARPA" ...
##  $ voucher1    : num [1:250] 78484 78484 78484 78484 78484 ...
##  $ voucher2    : num [1:250] NA NA NA NA NA NA NA NA NA NA ...
##  $ voucher3    : num [1:250] NA NA NA NA NA NA NA NA NA NA ...
##  $ voucher4    : num [1:250] NA NA NA NA NA NA NA NA NA NA ...
##  $ voucher...9 : logi [1:250] NA NA NA NA NA NA ...
##  $ voucher...10: logi [1:250] NA NA NA NA NA NA ...
##  $ voucher...11: logi [1:250] NA NA NA NA NA NA ...
##  $ voucher...12: logi [1:250] NA NA NA NA NA NA ...
##  $ LIANA       : chr [1:250] NA NA NA NA ...
##  $ N(IND.)     : num [1:250] 2 6 2 3 4 7 6 4 6 2 ...
##  $ STEMDBH...15: num [1:250] 5 3 3 4 5 7 3 2.5 3 4 ...
##  $ STEMDBH...16: num [1:250] 5 3 5 9 7 7 7 7 7 2.5 ...
##  $ STEMDBH...17: num [1:250] NA 2.5 NA 4 8 4 2.5 3 5 NA ...
##  $ STEMDBH...18: num [1:250] NA 5 NA 7 5 2.5 7 4 3 NA ...
##  $ STEMDBH...19: num [1:250] NA 4 NA NA NA 6 2.5 NA 3 NA ...
##  $ STEMDBH...20: num [1:250] NA 13 NA NA NA 9 4 NA 7 NA ...
##  $ STEMDBH...21: num [1:250] NA NA NA NA NA 7 6 NA NA NA ...
# Clean column names and rename column names for La Selva.
la_selva <- la_selva %>% janitor::clean_names()

la_selva <- la_selva %>%
  mutate(species_name = paste(genus, species))

la_selva_long <- la_selva %>%
  pivot_longer(
    cols = starts_with("stemdbh"),
    names_to = "stem",
    values_to = "dbh"
  )

la_selva_long <- la_selva_long %>%
  filter(!is.na(dbh))

la_selva_clean <- la_selva_long %>%
  select(
    transect = line,
    species = species_name,
    dbh,
    n_ind
  ) %>%
  mutate(site = "La Selva")

glimpse(la_selva_clean)
## Rows: 347
## Columns: 5
## $ transect <dbl> 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6…
## $ species  <chr> "ANAXAGOREA PHAEOCARPA", "ANAXAGOREA PHAEOCARPA", "ANAXAGOREA…
## $ dbh      <dbl> 5.0, 5.0, 3.0, 3.0, 2.5, 5.0, 4.0, 13.0, 3.0, 5.0, 4.0, 9.0, …
## $ n_ind    <dbl> 2, 2, 6, 6, 6, 6, 6, 6, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 7, 7, 7…
## $ site     <chr> "La Selva", "La Selva", "La Selva", "La Selva", "La Selva", "…
# Clean column names and rename column names for Cedar Bluffs.
cedar_bluffs <- cedar_bluffs %>% janitor::clean_names()

cedar_bluffs <- cedar_bluffs %>%
  mutate(species_name = paste(genus, species))

cedar_long <- cedar_bluffs %>%
  pivot_longer(
    cols = starts_with("stemdbh"),
    names_to = "stem",
    values_to = "dbh"
  )

cedar_long <- cedar_long %>%
  filter(!is.na(dbh))

cedar_clean <- cedar_long %>%
  select(
    transect = line,
    species = species_name,
    dbh,
    n_ind
  ) %>%
  mutate(site = "Cedar Bluffs")

glimpse(cedar_clean)
## Rows: 175
## Columns: 5
## $ transect <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "10", "10",…
## $ species  <chr> "ACER NEGUNDO", "ACER NEGUNDO", "ACER NEGUNDO", "ACER SACCHAR…
## $ dbh      <dbl> 8.0, 4.0, 7.0, 2.5, 2.5, 8.0, 38.0, 4.0, 27.0, 12.0, 4.0, 10.…
## $ n_ind    <dbl> 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 15, 15, 15, 15, 15, 15, 15, 15,…
## $ site     <chr> "Cedar Bluffs", "Cedar Bluffs", "Cedar Bluffs", "Cedar Bluffs…
# Combine data from both sites and group by site, transect, and species.
la_selva_clean <- la_selva_clean %>%
  mutate(transect = as.character(transect))

cedar_clean <- cedar_clean %>%
  mutate(transect = as.character(transect))

combined_data <- bind_rows(la_selva_clean, cedar_clean)

combined_data$transect <- factor(
  combined_data$transect,
  levels = sort(unique(as.numeric(combined_data$transect)))
)

combined_data %>%
  group_by(site, transect, species) %>%
  summarise(count = n())
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by site, transect, and species.
## ℹ Output is grouped by site and transect.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(site, transect, species))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.
## # A tibble: 314 × 4
## # Groups:   site, transect [20]
##    site         transect species           count
##    <chr>        <fct>    <chr>             <int>
##  1 Cedar Bluffs 1        ACER NEGUNDO          3
##  2 Cedar Bluffs 1        ACER SACCHARUM        7
##  3 Cedar Bluffs 1        QUERCUS RUBRA         1
##  4 Cedar Bluffs 1        ULMUS RUBRA           1
##  5 Cedar Bluffs 2        ACER SACCHARUM        7
##  6 Cedar Bluffs 2        CARYA OVATA           1
##  7 Cedar Bluffs 2        CERCIS CANADENSIS     1
##  8 Cedar Bluffs 2        JUGLANS NIGRA         1
##  9 Cedar Bluffs 2        OSTRYA VIRGINIANA     3
## 10 Cedar Bluffs 2        PRUNUS SEROTINA       2
## # ℹ 304 more rows
# Calculate biodiversity metrics (Simpson’s D, 1-D, 1/D, ESimpson, Shannon H, EShannon)
# for each transect at both sites.
species_counts <- combined_data %>%
  group_by(site, transect, species) %>%
  summarise(n = n(), .groups = "drop")

transect_summary <- species_counts %>%
  group_by(site, transect) %>%
  mutate(
    N = sum(n),
    S = n_distinct(species),
    pi = n / N
  )

biodiversity_indices <- transect_summary %>%
  group_by(site, transect) %>%
  summarise(
    S = first(S),
    N = first(N),
    
    D = sum(pi^2), # Simpson's D
    
    simpson_diversity_index = 1 - D,  # Simpson's index of diversity
    
    simpson_reciprocal_index = 1 / D,  # Simpson's reciprocal index
    
    E_simpson = (1 / D) / S,  # Simpson's index of diversity evenness (ESimpson)
    
    shannon_H = -sum(pi * log(pi)), # Shannon diversity index (H)
    
    E_shannon = shannon_H / log(S), # Shannon evenness (EShannon)
    
    .groups = "drop"
  )

print(biodiversity_indices)
## # A tibble: 20 × 10
##    site         transect     S     N      D simpson_diversity_index
##    <chr>        <fct>    <int> <int>  <dbl>                   <dbl>
##  1 Cedar Bluffs 1            4    12 0.417                    0.583
##  2 Cedar Bluffs 2            8    17 0.232                    0.768
##  3 Cedar Bluffs 3            6    16 0.359                    0.641
##  4 Cedar Bluffs 4            7    17 0.253                    0.747
##  5 Cedar Bluffs 5            6    19 0.429                    0.571
##  6 Cedar Bluffs 6            5    16 0.430                    0.570
##  7 Cedar Bluffs 7            6    20 0.355                    0.645
##  8 Cedar Bluffs 8            6    15 0.467                    0.533
##  9 Cedar Bluffs 9            7    17 0.377                    0.623
## 10 Cedar Bluffs 10          10    26 0.355                    0.645
## 11 La Selva     1           29    37 0.0402                   0.960
## 12 La Selva     2           28    39 0.0572                   0.943
## 13 La Selva     3           29    36 0.0386                   0.961
## 14 La Selva     4           23    27 0.0562                   0.944
## 15 La Selva     5           25    35 0.0547                   0.945
## 16 La Selva     6           21    38 0.0817                   0.918
## 17 La Selva     7           25    44 0.0744                   0.926
## 18 La Selva     8           19    24 0.0694                   0.931
## 19 La Selva     9           23    37 0.0650                   0.935
## 20 La Selva     10          27    30 0.04                     0.96 
## # ℹ 4 more variables: simpson_reciprocal_index <dbl>, E_simpson <dbl>,
## #   shannon_H <dbl>, E_shannon <dbl>
# DBH distribution by transect
ggplot(combined_data, aes(x = dbh, fill = site)) +
  geom_histogram(
    binwidth = 0.1,
    color = "black",
    alpha = 0.7
  ) +
  scale_x_log10() + facet_grid(site ~ transect, scales = "free_y") +
  labs(
    title = "Distribution of DBH by Transect",
    x = "DBH (cm, log scale)",
    y = "Number of Individuals"
  )

# DBH distribution by species
top_species <- combined_data %>%
  count(site, species, sort = TRUE) %>%
  group_by(site) %>%
  slice_head(n = 10) %>%
  ungroup()

filtered_data <- combined_data %>%
  semi_join(top_species, by = c("site", "species"))

ggplot(filtered_data, aes(x = species, y = dbh)) +
  geom_boxplot() +
  facet_wrap(~site) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "DBH Distributions of Dominant Tree Species By Site",
    x = "Species",
    y = "DBH (cm)"
  )

# Comparison of DBH Across Transects Between Sites
ggplot(combined_data, aes(x = transect, y = dbh, color = site)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.3, width = 0.2) +
  scale_y_log10() +
  labs(
    title = "Comparison of DBH Across Transects Between Sites",
    x = "Transect",
    y = "DBH (cm, log scale)"
  )