knitr::opts_chunk$set(echo = TRUE)

load packages

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(purrr)
library(readr)

Load and manipulate the sPlot dataset

load("~/Library/CloudStorage/OneDrive-UniversitadegliStudiRomaTre/POSTDOC/splot dataset/sPlotOpen.RData")

# Obtain unique levels for the PlotObservationID and Species columns
plot_levels <- unique(DT2.oa$PlotObservationID)
species_levels <- unique(DT2.oa$Species)

# Calculate the number of unique species in the dataset
unique_species_count <- DT2.oa %>%
  summarise(unique_count = n_distinct(Species)) # 42529 species
  1. Species thresholds
    The sPlotOpen dataset contains over 42,500 species across more than 95,000 plots. I’ve started applying some filters to reduce the number of species we’ll work on. I’ve tried four different filters, but I think we need to discuss which solution is ideal:
  1. Species present in at least one plot with coverage greater than 5% (~21,700 species)
  2. Species present in at least one plot with coverage greater than 10% (~18,600 species)
  3. Species present in at least one plot with coverage greater than 20% (~14,100 species)
  4. Species present in at least ~0.5% of plots (~680 species)
  5. A combined filter of species present in at least 0.5% of plots or present in at least one plot with coverage greater than 40% (i.e., species that define physiognomies but may only be locally present, like birches)
# Identify species that are present in at least 5% of the rows
species_min_05per_rows <- as.data.frame(as.data.frame(names(which(table(DT2.oa$Species) >= 475))))
species_min_05per_rows <- species_min_05per_rows %>%
  rename(Species = 1) 

# Identify species that have a Relative_cover value greater than 5% at least once
species_abundance_5 <- as.data.frame(unique(DT2.oa$Species[DT2.oa$Relative_cover > 0.05]))
species_abundance_5 <- species_abundance_5 %>%
  rename(Species = 1) 

# Identify species that have a Relative_cover value greater than 10% at least once
species_abundance_10 <- as.data.frame(unique(DT2.oa$Species[DT2.oa$Relative_cover > 0.10]))
species_abundance_10 <- species_abundance_10 %>%
  rename(Species = 1) 

# Identify species that have a Relative_cover value greater than 20% at least once
species_abundance_20 <- as.data.frame(unique(DT2.oa$Species[DT2.oa$Relative_cover > 0.20]))
species_abundance_20 <- species_abundance_20 %>%
  rename(Species = 1) 

# Identify species that occur at least 475 times
species_min_05per_rows <- as.data.frame(names(which(table(DT2.oa$Species) >= 475)))
species_min_05per_rows <- species_min_05per_rows %>%
  rename(Species = 1) 

# Identify species that have a Relative_cover value greater than 40% at least once
species_abundance_40 <- as.data.frame(unique(DT2.oa$Species[DT2.oa$Relative_cover > 0.4]))
species_abundance_40 <- species_abundance_40 %>%
  rename(Species = 1) 

# Combine species sets using an OR condition while maintaining all columns
species_comb_filt <- full_join(species_min_05per_rows, species_abundance_40, by = "Species") 

# Remove any duplicates
species_comb_filt <- unique(species_comb_filt)

# Create a dataframe with Species and a Category column describing the applied filters
species_comb_filt <- as.data.frame(species_comb_filt)
colnames(species_comb_filt) <- "Species"

# Check the classes of the species datasets
class(species_min_05per_rows)
## [1] "data.frame"
class(species_abundance_5)
## [1] "data.frame"
class(species_abundance_10)
## [1] "data.frame"
class(species_abundance_20)
## [1] "data.frame"
class(species_comb_filt)
## [1] "data.frame"
# Combine all species dataframes into one
all_species <- bind_rows(
  species_min_05per_rows %>% mutate(Category = ">= 0.5% rows"),
  species_abundance_5 %>% mutate(Category = "> 5% abundance"),
  species_abundance_10 %>% mutate(Category = "> 10% abundance"),
  species_abundance_20 %>% mutate(Category = "> 20% abundance"),
  species_comb_filt %>% mutate(Category = "combined 5% rows OR > 40% abundance")
)

# Export the combined dataframe as a CSV file
write.csv(all_species, "species_criteria.csv", row.names = FALSE)

Load Flamits data

# Load the Flamits dataset from a specified file path
flamits <- read_delim("~/Library/CloudStorage/OneDrive-UniversitadegliStudiRomaTre/POSTDOC/flamit/data_file.csv", delim = ";", escape_double = FALSE, trim_ws = TRUE)
## Rows: 19972 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (28): taxon_name, original_name, var_name, flam_dimension, burning_devic...
## dbl  (5): ID, taxon_ID, var_value, biome, fire
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the taxon_name column to Species for consistency
flamits <- flamits %>%
  rename(Species = taxon_name)

# Count the number of unique species in the Flamits dataset
num_unique_species <- flamits %>%
  summarise(unique_species = n_distinct(Species)) %>%
  pull(unique_species)

# Print the number of unique species found
print(num_unique_species)
## [1] 1790
# Create a list of datasets and their corresponding names
datasets <- list(
  species_min_05per_rows = species_min_05per_rows,
  species_abundance_5 = species_abundance_5,
  species_abundance_10 = species_abundance_10,
  species_abundance_20 = species_abundance_20,
  species_comb_filt = species_comb_filt
)

# Function to count matches between datasets and Flamits, and calculate the percentage
count_matching_species <- function(dataset, name) {
  matching_records <- dataset %>%
    semi_join(flamits, by = "Species")  # Join based on Species column
  
  count_matching <- nrow(matching_records)  # Count of matching records
  total_species <- n_distinct(dataset$Species)  # Total number of species in the dataset
  
  # Calculate the percentage of matching species
  percentage <- ifelse(total_species > 0, count_matching / total_species, 0)
  
  # Return a dataframe with the dataset name, count, and ratio
  return(data.frame(Dataset = name, Count = count_matching, Ratio = percentage))
}

# Execute the matching function for each dataset and combine the results
results_species_flamits <- map2_dfr(datasets, names(datasets), count_matching_species)

# Print the matching results
print(results_species_flamits)
##                  Dataset Count      Ratio
## 1 species_min_05per_rows   150 0.21994135
## 2    species_abundance_5   905 0.04350334
## 3   species_abundance_10   779 0.05389512
## 4   species_abundance_20   617 0.06696332
## 5      species_comb_filt   445 0.08541267
# Ensure the correct column name for genus extraction
flamits <- flamits %>%
  mutate(Genus = word(Species, 1))  # Create a new column for the genus

# List of datasets and their corresponding names
datasets <- list(
  species_min_05per_rows = species_min_05per_rows,
  species_abundance_5 = species_abundance_5,
  species_abundance_10 = species_abundance_10,
  species_abundance_20 = species_abundance_20,
  species_comb_filt = species_comb_filt
)

# Function to count matches by genus and calculate the percentage
count_matching_genus <- function(dataset, name) {
  dataset <- dataset %>%
    mutate(Genus = word(Species, 1))  # Add a column for genus extraction

  # Find unique genera that match with Flamits
  matching_records <- dataset %>%
    semi_join(flamits, by = "Genus") %>%
    distinct(Genus)  # Remove duplicate genera

  # Count matching genera and calculate percentage
  count_matching <- nrow(matching_records)  # Count of matching genera
  total_genus <- n_distinct(dataset$Genus)  # Total number of genera in the dataset

  # Calculate the percentage of matching genera
  percentage <- ifelse(total_genus > 0, count_matching / total_genus, 0)

  # Return a dataframe with the dataset name, count, and ratio
  return(data.frame(Dataset = name, Count = count_matching, Ratio = percentage))
}

# Execute the genus matching function for each dataset and combine the results
results_genus_flamits <- map2_dfr(datasets, names(datasets), count_matching_genus)

# Print the results
print(results_genus_flamits)
##                  Dataset Count     Ratio
## 1 species_min_05per_rows   172 0.4467532
## 2    species_abundance_5   734 0.1588401
## 3   species_abundance_10   684 0.1865794
## 4   species_abundance_20   589 0.2183908
## 5      species_comb_filt   449 0.2542469
# Combine the results from species and genus matching
combined_results <- bind_rows(results_species_flamits, results_genus_flamits)

# Assign source labels to the combined results
combined_results_flamits <- bind_rows(
  results_species_flamits %>% mutate(Source = "Flamits_Species"),
  results_genus_flamits %>% mutate(Source = "Flamits_Genus")
)

Try data

# Load the TRY dataset
library(readxl)
av_TRY <- read_excel("~/Library/CloudStorage/OneDrive-UniversitadegliStudiRomaTre/POSTDOC/TRY_AV.xlsx")

# List of datasets and their corresponding names
datasets <- list(
  species_min_05per_rows = species_min_05per_rows,
  species_abundance_5 = species_abundance_5,
  species_abundance_10 = species_abundance_10,
  species_abundance_20 = species_abundance_20,
  species_comb_filt = species_comb_filt
)

# Function to filter and calculate proportions of TRUE values for traits
process_dataset <- function(dataset_name, species_df) {
  filtered_av_TRY <- av_TRY %>%
    filter(Species %in% species_df$Species)
  
  traits_TF <- filtered_av_TRY %>%
    mutate(across(-Species, ~ . > 0))  # Apply condition "greater than 0" to all columns except 'Species'
  
  proportions_TF <- traits_TF %>%
    summarize(across(-Species, ~ mean(.)))  # Calculate the mean (proportion of TRUE) for each column except 'Species'
  
  proportions_TF <- proportions_TF %>%
    mutate(dataset = dataset_name)  # Add a column to identify the dataset
  
  # Reorder columns to place 'dataset' first
  proportions_TF %>%
    select(dataset, everything())
}

# Apply the function to each dataset
results_10 <- process_dataset("species_abundance_10", species_abundance_10)
results_20 <- process_dataset("species_abundance_20", species_abundance_20)
results_5 <- process_dataset("species_abundance_5", species_abundance_5)
results_min_5 <- process_dataset("species_min_05per_rows", species_min_05per_rows)
results_comb <- process_dataset("species_comb_filt", species_comb_filt)

# Combine all results into a single dataframe
final_results_TRY <- bind_rows(results_10, results_20, results_5, results_min_5, results_comb)

# Print the final results
print(final_results_TRY)
## # A tibble: 5 × 20
##   dataset   .Leaf area per leaf …¹ .Leaf area per leaf …² .Leaf area per leaf …³
##   <chr>                      <dbl>                  <dbl>                  <dbl>
## 1 species_…                  0.282                  0.349                  0.523
## 2 species_…                  0.308                  0.376                  0.550
## 3 species_…                  0.262                  0.318                  0.494
## 4 species_…                  0.748                  0.831                  0.917
## 5 species_…                  0.355                  0.428                  0.592
## # ℹ abbreviated names:
## #   ¹​`.Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA): petiole excluded.`,
## #   ²​`.Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA): petiole included.`,
## #   ³​`.Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA): undefined if petiole is in- or exclu.`
## # ℹ 16 more variables: `.Plant height vegetative.` <dbl>,
## #   `.Seed dry mass.` <dbl>, `.Plant growth rate.` <dbl>,
## #   `.Plant life form (Raunkiaer life form).` <dbl>, …
# Load necessary libraries for string manipulation
library(dplyr)
library(stringr)  # For string manipulation functions like stringr::word()

# Function to extract genus, calculate proportions of TRUE for traits, and add dataset information
process_dataset_by_genus <- function(dataset_name, species_df) {
  filtered_av_TRY <- av_TRY %>%
    filter(Species %in% species_df$Species) %>%
    mutate(Genus = word(Species, 1))  # Extract genus (first word of Species)
  
  # Create a T/F table for values > 0
  traits_TF <- filtered_av_TRY %>%
    mutate(across(-c(Species, Genus), ~ . > 0))
  
  # Group by Genus and calculate the proportion of TRUE for each trait
  proportions_TF <- traits_TF %>%
    group_by(Genus) %>%
    summarize(across(-c(Species), ~ mean(.)))  # Proportion of TRUE for each trait
  
  # Add a column to identify the dataset
  proportions_TF <- proportions_TF %>%
    mutate(dataset = dataset_name)
  
  # Reorder columns to place 'dataset' and 'Genus' first
  proportions_TF %>%
    select(dataset, Genus, everything())
}

# Apply the function to each dataset
results_10 <- process_dataset_by_genus("species_abundance_10", species_abundance_10)
results_20 <- process_dataset_by_genus("species_abundance_20", species_abundance_20)
results_5 <- process_dataset_by_genus("species_abundance_5", species_abundance_5)
results_min_5 <- process_dataset_by_genus("species_min_05per_rows", species_min_05per_rows)
results_comb <- process_dataset_by_genus("species_comb_filt", species_comb_filt)

# Combine all results into a single dataframe
final_results_by_genus <- bind_rows(results_10, results_20, results_5, results_min_5, results_comb)

# Assign TRUE if the value is greater than 0 across the entire dataframe
final_results_by_genus <- final_results_by_genus %>%
  mutate(across(-c(dataset, Genus), ~ . > 0))

# Calculate the percentage of TRUE for each trait based on the dataset
percentage_T <- final_results_by_genus %>%
  group_by(dataset) %>%
  summarize(across(-Genus, ~ mean(.)))  # Calculate the percentage of TRUE
# Combine results from TRY datasets based on genus and species
combined_results_TRY <- bind_rows(
  percentage_T %>% mutate(Source = "TRY_genus"),
  final_results_TRY %>% mutate(Source = "TRY_species")
)

# Combine results from Flamits based on species and genus
combined_results_flamits <- bind_rows(
  results_species_flamits %>% mutate(Source = "Flamits_Species"),
  results_genus_flamits %>% mutate(Source = "Flamits_Genus")
)

# Export combined results to CSV files
write.csv(combined_results_TRY, "TRY_coverage.csv")
write.csv(combined_results_flamits, "flamits_coverage.csv")

# Export combined results to an Excel file
library(openxlsx)
write.xlsx(combined_results_TRY, "TRY_coverage.xlsx")