Aspergillus_Analysis.Rmd

Dataset Overview

# List the fields (columns) available in the GBIF occurrence table
names(datanew2)
##  [1] "gbifID"                           "datasetKey"                      
##  [3] "occurrenceID"                     "kingdom"                         
##  [5] "phylum"                           "class"                           
##  [7] "order"                            "family"                          
##  [9] "genus"                            "species"                         
## [11] "infraspecificEpithet"             "taxonRank"                       
## [13] "scientificName"                   "verbatimScientificName"          
## [15] "verbatimScientificNameAuthorship" "countryCode"                     
## [17] "locality"                         "stateProvince"                   
## [19] "occurrenceStatus"                 "individualCount"                 
## [21] "publishingOrgKey"                 "decimalLatitude"                 
## [23] "decimalLongitude"                 "coordinateUncertaintyInMeters"   
## [25] "coordinatePrecision"              "elevation"                       
## [27] "elevationAccuracy"                "depth"                           
## [29] "depthAccuracy"                    "eventDate"                       
## [31] "day"                              "month"                           
## [33] "year"                             "taxonKey"                        
## [35] "speciesKey"                       "basisOfRecord"                   
## [37] "institutionCode"                  "collectionCode"                  
## [39] "catalogNumber"                    "recordNumber"                    
## [41] "identifiedBy"                     "dateIdentified"                  
## [43] "license"                          "rightsHolder"                    
## [45] "recordedBy"                       "typeStatus"                      
## [47] "establishmentMeans"               "lastInterpreted"                 
## [49] "mediaType"                        "issue"                           
## [51] "X"
# Count total Aspergillus records
total_records <- nrow(datanew2)

# Label given to records identified only to genus level (no species name)
genus_label <- "Aspergillus sp"

# Relabel every record whose species name is blank or missing as genus-level.
# One assignment covers all such rows (including those whose scientificName is
# "Aspergillus P.Micheli, 1729"). is.na() is tested explicitly because
# `NA == ""` evaluates to NA, which would otherwise leave missing names
# unlabelled and turn the counts below into NA.
is_genus_only <- is.na(datanew2$species) | datanew2$species == ""
datanew2$species[is_genus_only] <- genus_label

# Flag records that carry a real species name
is_species_level <- datanew2$species != genus_label

# Count unique species (excluding genus-level records)
num_species <- length(unique(datanew2$species[is_species_level]))

# Count genus-level records (excluded from plot)
genus_level_records <- sum(!is_species_level)

# Print summary
total_records
## [1] 92116
num_species
## [1] 517
genus_level_records
## [1] 13693

Species Abundance Plot

# Keep only records identified to species level (drop the genus-only
# "Aspergillus sp" records)
filtered_data <- datanew2[datanew2$species != "Aspergillus sp", ]

# Tabulate occurrences per species; the resulting data frame has the
# columns Var1 (species name) and Freq (number of records)
species_abundance <- data.frame(table(filtered_data$species))

# Remove empty species names (if any)
species_abundance <- species_abundance[species_abundance$Var1 != "", ]

# Keep species with more than 1000 occurrences
filtered_species <- subset(species_abundance, Freq > 1000)

# Order the factor levels by decreasing abundance so bars are drawn from the
# most to the least abundant species
filtered_species$Var1 <- factor(
  filtered_species$Var1,
  levels = filtered_species$Var1[order(-filtered_species$Freq)]
)

# List of species to highlight (marked with "*" above their bar)
highlight_species <- c("Aspergillus niger", "Aspergillus flavus", "Aspergillus fumigatus", 
                       "Aspergillus nidulans", "Aspergillus terreus")

# Upper y-axis limit: 10000 by default, extended when a bar is taller.
# ylim() removes values outside its range rather than clipping them, so a
# fixed limit would silently drop any species with more than 10000 records.
y_upper <- max(10000, filtered_species$Freq)

# Create the plot
ggplot(filtered_species, aes(x = Var1, y = Freq)) +
  geom_col(fill = "navy") +
  xlab("Species") +
  ylab("Abundance") +
  ylim(c(0, y_upper)) +
  theme(
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # `linewidth` replaces the `size` argument deprecated in ggplot2 3.4.0
    axis.line = element_line(color = "black", linewidth = 1),
    axis.text.y = element_text(color = "black", size = 10),
    axis.text.x = element_text(color = "black", size = 10, angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  geom_text(
    # Star only the highlighted species; other bars get an empty label
    aes(label = ifelse(Var1 %in% highlight_species, "*", "")),
    vjust = 0, color = "black", fontface = "bold", size = 5
  ) +
  guides(fill = "none")

Conclusion

# Emit the Markdown-formatted conclusion. Calls that interpolate values pass
# sep = "" because cat() otherwise inserts a space between arguments, which
# puts whitespace inside the ** markers and stops them rendering as bold.
cat("This plot shows the abundance of *Aspergillus* species in the dataset obtained from [GBIF.org (11 March 2025)](https://doi.org/10.15468/dl.zgyxs9).\n\n")
## This plot shows the abundance of *Aspergillus* species in the dataset obtained from [GBIF.org (11 March 2025)](https://doi.org/10.15468/dl.zgyxs9).
cat("The dataset includes a total of **", total_records, "** registered *Aspergillus* records, with **", num_species, "** distinct species identified.\n\n", sep = "")
## The dataset includes a total of **92116** registered *Aspergillus* records, with **517** distinct species identified.
cat("Additionally, **", genus_level_records, "** records were identified only at the **genus level** (*Aspergillus sp.*) and excluded from the plot to focus on identified species.\n\n", sep = "")
## Additionally, **13693** records were identified only at the **genus level** (*Aspergillus sp.*) and excluded from the plot to focus on identified species.
cat("The most abundant species include *Aspergillus niger*, *Aspergillus flavus*, and *Aspergillus fumigatus*, among others. The plot only includes species with **more than 1000 occurrences**.")
## The most abundant species include *Aspergillus niger*, *Aspergillus flavus*, and *Aspergillus fumigatus*, among others. The plot only includes species with **more than 1000 occurrences**.