FINAL-R-script.R

### BIODIVERSITY METRICS: FINAL SCRIPT

### Background info ---

# The dataset I used is the one from class (california_species_abundances). 
# After cleaning, it includes abundance data for 21 bird species across 10 sites 
# in California. These include coastal, foothill, montane, wetland, and desert-edge habitats.
# Each column represents a species, and each row represents a site. 
# Abundance values represent the number of individuals reported at each site.
# Using this data, the report analyzes how species richness, evenness, 
# and differences in species composition vary across different California ecosystems.

# Bird species included: California Quail, American Kestrel, Western Bluebird, 
# Acorn Woodpecker, Red-tailed Hawk, Great Egret, Snowy Plover, Peregrine Falcon,
# Northern Flicker, Western Meadowlark, Turkey Vulture, Barn Owl, Anna's Hummingbird,
# Common Raven, Osprey, Black Phoebe, Yellow-rumped Warbler, Spotted Towhee,
# Steller's Jay, and California Thrasher.

# Sites included: Yosemite Valley, Sierra Foothills, Point Reyes, Lake Tahoe, 
# Channel Islands, Central Valley Wetlands, Salton Sea, Lassen Volcanic Park, 
# Elkhorn Slough, and Lake Berryessa.


### 1. Load libraries ---

library(readxl)

## Warning: package 'readxl' was built under R version 4.5.2

library(vegan)

## Warning: package 'vegan' was built under R version 4.5.2

## Loading required package: permute

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.2

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(patchwork)

## Warning: package 'patchwork' was built under R version 4.5.2

### 2. Import dataset ---

# (project working directory should be the folder that contains this Excel file)
california_species <- read_excel("california_species_abundances_FINAL.xlsx")

# Look at the data
str(california_species)

## tibble [20 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Species                : chr [1:20] "California Quail" "American Kestrel" "Western Bluebird" "Acorn Woodpecker" ...
##  $ Yosemite Valley        : num [1:20] 5 8 6 3 4 3 2 7 3 3 ...
##  $ Sierra Foothills       : num [1:20] 8 9 7 6 0 12 10 1 7 7 ...
##  $ Point Reyes            : num [1:20] 9 5 2 3 2 9 5 1 6 5 ...
##  $ Lake Tahoe             : num [1:20] 5 5 11 2 3 3 7 5 0 3 ...
##  $ Channel Islands        : num [1:20] 10 2 0 3 12 8 9 4 7 1 ...
##  $ Central Valley Wetlands: num [1:20] 8 7 9 5 10 0 11 1 5 9 ...
##  $ Salton Sea             : num [1:20] 2 4 7 5 1 5 2 2 7 10 ...
##  $ Lassen Volcanic Park   : num [1:20] 2 4 1 10 1 9 13 4 6 1 ...
##  $ Elkhorn Slough         : num [1:20] 2 3 12 0 7 6 4 10 1 6 ...
##  $ Lake Berryessa         : num [1:20] 9 6 6 6 9 3 3 6 9 2 ...

summary(california_species)

##    Species          Yosemite Valley Sierra Foothills  Point Reyes   
##  Length:20          Min.   : 0.00   Min.   : 0.00    Min.   : 0.00  
##  Class :character   1st Qu.: 2.75   1st Qu.: 3.00    1st Qu.: 2.00  
##  Mode  :character   Median : 3.00   Median : 7.00    Median : 5.00  
##                     Mean   : 4.25   Mean   : 6.60    Mean   : 5.05  
##                     3rd Qu.: 6.00   3rd Qu.: 9.25    3rd Qu.: 7.00  
##                     Max.   :14.00   Max.   :13.00    Max.   :13.00  
##    Lake Tahoe    Channel Islands Central Valley Wetlands   Salton Sea   
##  Min.   : 0.00   Min.   : 0.00   Min.   : 0.00           Min.   : 0.00  
##  1st Qu.: 1.75   1st Qu.: 2.75   1st Qu.: 2.75           1st Qu.: 2.00  
##  Median : 3.00   Median : 4.50   Median : 5.00           Median : 5.00  
##  Mean   : 4.35   Mean   : 5.20   Mean   : 5.50           Mean   : 5.40  
##  3rd Qu.: 7.00   3rd Qu.: 7.25   3rd Qu.: 9.00           3rd Qu.: 7.25  
##  Max.   :13.00   Max.   :12.00   Max.   :11.00           Max.   :13.00  
##  Lassen Volcanic Park Elkhorn Slough  Lake Berryessa 
##  Min.   : 1.00        Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 2.00        1st Qu.: 1.75   1st Qu.: 2.75  
##  Median : 4.00        Median : 3.00   Median : 6.00  
##  Mean   : 5.25        Mean   : 3.95   Mean   : 5.40  
##  3rd Qu.: 9.00        3rd Qu.: 6.00   3rd Qu.: 6.75  
##  Max.   :13.00        Max.   :12.00   Max.   :14.00

View(california_species)

# rows = species
# first column = species names (called "Species")
# remaining columns = sites with abundance counts

# Observation: 
# This dataset includes abundances of 21 bird species across 10 California sites,
# spanning coastal, foothill, montane, wetland, and desert-edge habitats. 
# Each row is a species and each column (after the first) is a site. 
# Abundance values are counts of individuals per site. I converted the data 
# from species-by-site format into a site-by-species community matrix for analysis.


### 3. Prepare community matrix ---

# Remove first column (species names), keep numeric abundances
bird_counts <- california_species[, -1]

# Transpose: rows = sites, columns = species
bird_counts_t <- t(as.matrix(bird_counts))
colnames(bird_counts_t) <- california_species$Species
rownames(bird_counts_t) <- colnames(california_species)[-1]

# Community matrix (sites x species)
community_matrix <- bird_counts_t

# Long format for plotting / summaries
community_long <- community_matrix %>%
  as.data.frame() %>%
  mutate(Site = rownames(.)) %>%
  pivot_longer(cols = -Site,
               names_to = "Species",
               values_to = "Abundance")

# I changed the matrix so that rows represent sites and columns represent species, 
# creating a community matrix suitable for vegan. 
# I also created a long-format version (community_long) for
# plotting, where each row is a site–species–abundance combination. 


### 4. Calculate alpha diversity metrics ---

# Species richness per site
richness <- specnumber(community_matrix)

# Shannon Index per site
shannon  <- diversity(community_matrix, index = "shannon")

# Simpson Index  per site
simpson  <- diversity(community_matrix, index = "simpson")

# Combine into one data frame
diversity_results <- data.frame(
  Site     = rownames(community_matrix),
  Richness = richness,
  Shannon  = shannon,
  Simpson  = simpson
)

# This section is where I calculated all alpha-diversity metrics.


### 5. Alpha and gamma set up ---

# alpha_richness = α-diversity per site
# gamma_richness = γ-diversity for the whole region

# Alpha diversity per site (species richness)
alpha_richness <- diversity_results$Richness
names(alpha_richness) <- diversity_results$Site

alpha_richness  # check

##         Yosemite Valley        Sierra Foothills             Point Reyes 
##                      19                      19                      18 
##              Lake Tahoe         Channel Islands Central Valley Wetlands 
##                      18                      19                      18 
##              Salton Sea    Lassen Volcanic Park          Elkhorn Slough 
##                      19                      20                      18 
##          Lake Berryessa 
##                      19

# Gamma diversity = total species richness across all sites combined
# (number of species present in at least one site)
gamma_richness <- specnumber(colSums(community_matrix))

gamma_richness

## [1] 20

View(diversity_results)

# This section takes the richness values already calculated in Section 4
# and reorganizes them to calculate gamma-diversity (total regional
# richness). Later in the script, I calculate beta-diversity using distance measures
# (Jaccard, Sørensen, Bray–Curtis, and Euclidean) and NMDS plots.


### 6. Distribution of Species Abundances Across Sites ---

# Boxplot of species abundance by site
ggplot(community_long, aes(x = Site, y = Abundance, fill = Site)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Distribution of Species Abundances Across Sites",
       x = "Site", y = "Abundance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Observation:
# Sites like Sierra Foothills, Central Valley Wetlands, and Lake Berryessa show
# a wide range of abundances, meaning many species occur at moderate numbers.
# Yosemite Valley and Elkhorn Slough have much narrower spreads, indicating that 
# one or two species make up most of the individuals at those sites.


### 7. Barplots using  ggplot2 (one metric at a time) ---

## Species richness
# ggplot2
ggplot(diversity_results, aes(x = Site, y = Richness)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Species Richness across Sites",
    x = "Site",
    y = "Species Richness"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

## Shannon index
# ggplot2
ggplot(diversity_results, aes(x = Site, y = Shannon)) +
  geom_col(fill = "lightgreen") +
  labs(
    title = "Shannon Diversity Index across Sites",
    x = "Site",
    y = "Shannon Index"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

## Simpson index
# ggplot2
ggplot(diversity_results, aes(x = Site, y = Simpson)) +
  geom_col(fill = "lightpink") +
  labs(
    title = "Simpson Diversity Index across Sites",
    x = "Site",
    y = "Simpson Index"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

# Observations: 
# Coastal and foothill sites such as Point Reyes, Channel Islands, 
# and Sierra Foothills tend to have higher species richness and Shannon diversity,
# showing both more species and more even distributions. 
# Sites like Yosemite Valley or Lake Tahoe have lower Shannon
# and higher Simpson values (lower diversity), suggesting dominance by a few species. 
# These patterns match the exploratory boxplots of abundance, where coastal 
# and foothill sites show broader spreads and fewer extreme dominants than some
# montane or more isolated sites.


### 8. Combined side-by-side plots using patchwork ---

p1 <- ggplot(diversity_results, aes(x = Site, y = Richness)) +
  geom_col(fill = "lightblue") +
  theme_minimal() +
  labs(title = "Richness", x = "Site", y = "Value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p2 <- ggplot(diversity_results, aes(x = Site, y = Shannon)) +
  geom_col(fill = "lightgreen") +
  theme_minimal() +
  labs(title = "Shannon", x = "Site", y = "Value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p3 <- ggplot(diversity_results, aes(x = Site, y = Simpson)) +
  geom_col(fill = "lightpink") +
  theme_minimal() +
  labs(title = "Simpson", x = "Site", y = "Value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p1 + p2 + p3

# Observation:
# The three side-by-side plots show that overall diversity is fairly similar
# across all sites, but there are small differences. Richness only varies a
# little, with Lassen Volcanic Park having the most species and most other sites
# sitting just below it. Shannon shows a similar trend—sites like Channel
# Islands, Sierra Foothills, and Lake Berryessa have slightly higher values,
# meaning they have both many species and fairly even abundances. Simpson values
# are high and close together for all sites, showing that no site is extremely
# uneven. Sites with slightly lower Simpson values, such as Yosemite Valley or
# Elkhorn Slough, likely have one or two species that dominate more than at
# other sites.


### 9. Combined ggplot2 barplot (all metrics together) ---

# Reshape data to long format for ggplot
div_long <- pivot_longer(
  diversity_results,
  cols = c(Richness, Shannon, Simpson),
  names_to  = "Metric",
  values_to = "Value"
)

# Plot all three metrics across sites
ggplot(div_long, aes(x = Site, y = Value, fill = Metric)) +
  geom_col(position = "dodge") +
  labs(
    title = "Diversity Metrics across Sites",
    x = "Site",
    y = "Diversity Value"
  ) +
  theme_minimal() +
  theme(
    axis.text.x  = element_text(angle = 45, hjust = 1),
    plot.title   = element_text(face = "bold", hjust = 0.5)
  )

# Observation:
# The three metrics plotted side by side shows how these sites are similar in 
# overall diversity, but there are still some differences.
# Richness varies the most, with Lassen Volcanic Park reaching the highest
# species count and most other sites sitting just below it. Shannon values follow
# the same pattern: sites like Channel Islands, Sierra Foothills, and Lake
# Berryessa are slightly higher, meaning they have both many species and fairly
# even abundances. Simpson values are high and tightly grouped across all sites,
# showing that no community is extremely uneven, though sites like Yosemite
# Valley and Elkhorn Slough dip a bit lower, suggesting some dominance by one or
# two species. Overall, this combined plot shows that differences among sites
# reflect evenness more than raw species numbers.


### 10. Gini-Simpson Index ---

# Gini-Simpson Index (1 - D)
diversity_results$Gini_Simpson <- 1 - diversity_results$Simpson

# Simpson's Evenness (E = (1/D) / S)
diversity_results$Simpson_Evenness <- (1 / diversity_results$Simpson) / diversity_results$Richness

# Combined plot of all four metrics (Shannon, Simpson, Gini-Simpson, Evenness)
# into one long table. (pivot_longer + all indices together)
# Reshape to long format
diversity_long <- diversity_results %>%
  pivot_longer(cols = c(Shannon, Simpson, Gini_Simpson, Simpson_Evenness),
               names_to = "Metric", values_to = "Value")

ggplot(diversity_long, aes(x = Site, y = Value, fill = Metric)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(title = "Diversity Metrics Across California Sites",
       x = "Site", y = "Index Value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Separate Plots (one per metric)
ggplot(diversity_results, aes(x = Site, y = Gini_Simpson, fill = Site)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  labs(title = "Gini-Simpson Index across Sites", x = "Site", y = "Gini-Simpson") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(diversity_results, aes(x = Site, y = Simpson_Evenness, fill = Site)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  labs(title = "Simpson's Evenness across Sites", x = "Site", y = "Evenness") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Observation:
# The Gini-Simpson Index shows that most sites have similar overall diversity.
# Coastal and foothill sites (like Sierra Foothills, Channel Islands, and Lake Berryessa)
# have slightly higher values, meaning many species contribute fairly evenly to the total
# abundance. In other words, these communities are diverse and not dominated by just one
# or two species.

# Simpson’s Evenness values are very similar across sites, but a few stand out.
# Sites like Sierra Foothills, Channel Islands, and Point Reyes have higher evenness,
# meaning their species are more evenly balanced in abundance (no single species dominates).
# Sites like Elkhorn Slough and Lake Tahoe are a bit lower, meaning a few species make up
# a larger share of the community.

# Shannon’s Index shows the same general pattern as the other metrics. It is highest at
# sites that have many species and where those species are fairly even in abundance
# (such as Sierra Foothills and Channel Islands). It is lower at sites where one or two
# species dominate the community (like Yosemite Valley or Elkhorn Slough).


### 11. Statistical comparison- ANOVA ---

# Even without replicates, I want to do an ANOVA test to show why it is not appropriate here.
anova_result<-aov(Shannon~Site,data=diversity_results)
summary(anova_result)

##             Df  Sum Sq  Mean Sq
## Site         9 0.02407 0.002674

# Observation: 
# There were no replicate observations (each site has only one Shannon value),
# so ANOVA is not applicable here. Visual comparison of diversity indices across sites is 
# the better way to go about this dataset.


### 12. Jaccard and Sørensen Indices for Beta Diversity ---

# Convert to presence–absence (1 = present, 0 = absent)
comm_pa <- (community_matrix > 0) * 1

# Jaccard dissimilarity (1 - Jaccard similarity)
jaccard_dist <- vegdist(comm_pa, method = "jaccard", binary = TRUE)

# Sørensen (Bray-Curtis on presence–absence gives quantitative Sørensen)
sorensen_dist <- vegdist(comm_pa, method = "bray", binary = TRUE)

# Inspect distance matrices
as.matrix(jaccard_dist)

##                         Yosemite Valley Sierra Foothills Point Reyes Lake Tahoe
## Yosemite Valley                    0.00             0.10  0.15000000 0.15000000
## Sierra Foothills                   0.10             0.00  0.15000000 0.15000000
## Point Reyes                        0.15             0.15  0.00000000 0.20000000
## Lake Tahoe                         0.15             0.15  0.20000000 0.00000000
## Channel Islands                    0.10             0.10  0.15000000 0.15000000
## Central Valley Wetlands            0.15             0.15  0.20000000 0.20000000
## Salton Sea                         0.10             0.10  0.15000000 0.05263158
## Lassen Volcanic Park               0.05             0.05  0.10000000 0.10000000
## Elkhorn Slough                     0.15             0.15  0.10526316 0.20000000
## Lake Berryessa                     0.10             0.10  0.05263158 0.15000000
##                         Channel Islands Central Valley Wetlands Salton Sea
## Yosemite Valley                    0.10                    0.15 0.10000000
## Sierra Foothills                   0.10                    0.15 0.10000000
## Point Reyes                        0.15                    0.20 0.15000000
## Lake Tahoe                         0.15                    0.20 0.05263158
## Channel Islands                    0.00                    0.15 0.10000000
## Central Valley Wetlands            0.15                    0.00 0.15000000
## Salton Sea                         0.10                    0.15 0.00000000
## Lassen Volcanic Park               0.05                    0.10 0.05000000
## Elkhorn Slough                     0.15                    0.20 0.15000000
## Lake Berryessa                     0.10                    0.15 0.10000000
##                         Lassen Volcanic Park Elkhorn Slough Lake Berryessa
## Yosemite Valley                         0.05     0.15000000     0.10000000
## Sierra Foothills                        0.05     0.15000000     0.10000000
## Point Reyes                             0.10     0.10526316     0.05263158
## Lake Tahoe                              0.10     0.20000000     0.15000000
## Channel Islands                         0.05     0.15000000     0.10000000
## Central Valley Wetlands                 0.10     0.20000000     0.15000000
## Salton Sea                              0.05     0.15000000     0.10000000
## Lassen Volcanic Park                    0.00     0.10000000     0.05000000
## Elkhorn Slough                          0.10     0.00000000     0.05263158
## Lake Berryessa                          0.05     0.05263158     0.00000000

as.matrix(sorensen_dist)

##                         Yosemite Valley Sierra Foothills Point Reyes Lake Tahoe
## Yosemite Valley              0.00000000       0.05263158  0.08108108 0.08108108
## Sierra Foothills             0.05263158       0.00000000  0.08108108 0.08108108
## Point Reyes                  0.08108108       0.08108108  0.00000000 0.11111111
## Lake Tahoe                   0.08108108       0.08108108  0.11111111 0.00000000
## Channel Islands              0.05263158       0.05263158  0.08108108 0.08108108
## Central Valley Wetlands      0.08108108       0.08108108  0.11111111 0.11111111
## Salton Sea                   0.05263158       0.05263158  0.08108108 0.02702703
## Lassen Volcanic Park         0.02564103       0.02564103  0.05263158 0.05263158
## Elkhorn Slough               0.08108108       0.08108108  0.05555556 0.11111111
## Lake Berryessa               0.05263158       0.05263158  0.02702703 0.08108108
##                         Channel Islands Central Valley Wetlands Salton Sea
## Yosemite Valley              0.05263158              0.08108108 0.05263158
## Sierra Foothills             0.05263158              0.08108108 0.05263158
## Point Reyes                  0.08108108              0.11111111 0.08108108
## Lake Tahoe                   0.08108108              0.11111111 0.02702703
## Channel Islands              0.00000000              0.08108108 0.05263158
## Central Valley Wetlands      0.08108108              0.00000000 0.08108108
## Salton Sea                   0.05263158              0.08108108 0.00000000
## Lassen Volcanic Park         0.02564103              0.05263158 0.02564103
## Elkhorn Slough               0.08108108              0.11111111 0.08108108
## Lake Berryessa               0.05263158              0.08108108 0.05263158
##                         Lassen Volcanic Park Elkhorn Slough Lake Berryessa
## Yosemite Valley                   0.02564103     0.08108108     0.05263158
## Sierra Foothills                  0.02564103     0.08108108     0.05263158
## Point Reyes                       0.05263158     0.05555556     0.02702703
## Lake Tahoe                        0.05263158     0.11111111     0.08108108
## Channel Islands                   0.02564103     0.08108108     0.05263158
## Central Valley Wetlands           0.05263158     0.11111111     0.08108108
## Salton Sea                        0.02564103     0.08108108     0.05263158
## Lassen Volcanic Park              0.00000000     0.05263158     0.02564103
## Elkhorn Slough                    0.05263158     0.00000000     0.02702703
## Lake Berryessa                    0.02564103     0.02702703     0.00000000

# Visualize
# Jaccard dendrogram - which sites share similar species presence.
# Cluster dendrograms based on Jaccard and Sørensen
plot(hclust(jaccard_dist),
     main = "Cluster Dendrogram (Jaccard Dissimilarity)",
     xlab = "", sub = "")

plot(hclust(sorensen_dist),
     main = "Cluster Dendrogram (Sørensen Dissimilarity)",
     xlab = "", sub = "")

# Observation:
# The Jaccard and Sørensen dendrograms show which sites have similar species lists. 
# Lake Berryessa and Point Reyes cluster closely, meaning they share many of the same 
# coastal/foothill species. Yosemite Valley and Lassen Volcanic Park form a montane pair, 
# while Channel Islands and Sierra Foothills also group together as relatively similar sites. 
# Central Valley Wetlands appears on its own branch, showing that its wetland bird community 
# is more distinct from the other sites.


### 13. Calculate Bray–Curtis dissimilarity ---

bray_dist <- vegdist(community_matrix, method = "bray")

# View the distance matrix (optional)
as.matrix(bray_dist)

##                         Yosemite Valley Sierra Foothills Point Reyes Lake Tahoe
## Yosemite Valley               0.0000000        0.3456221   0.4301075  0.3488372
## Sierra Foothills              0.3456221        0.0000000   0.2961373  0.4337900
## Point Reyes                   0.4301075        0.2961373   0.0000000  0.4361702
## Lake Tahoe                    0.3488372        0.4337900   0.4361702  0.0000000
## Channel Islands               0.3968254        0.3305085   0.2780488  0.3926702
## Central Valley Wetlands       0.3846154        0.2892562   0.4312796  0.4213198
## Salton Sea                    0.4093264        0.2833333   0.3492823  0.3846154
## Lassen Volcanic Park          0.5473684        0.3586498   0.3203883  0.3854167
## Elkhorn Slough                0.4268293        0.4881517   0.4333333  0.3614458
## Lake Berryessa                0.3782383        0.4083333   0.2822967  0.3948718
##                         Channel Islands Central Valley Wetlands Salton Sea
## Yosemite Valley               0.3968254               0.3846154  0.4093264
## Sierra Foothills              0.3305085               0.2892562  0.2833333
## Point Reyes                   0.2780488               0.4312796  0.3492823
## Lake Tahoe                    0.3926702               0.4213198  0.3846154
## Channel Islands               0.0000000               0.3738318  0.3867925
## Central Valley Wetlands       0.3738318               0.0000000  0.3119266
## Salton Sea                    0.3867925               0.3119266  0.0000000
## Lassen Volcanic Park          0.3301435               0.4232558  0.3239437
## Elkhorn Slough                0.4098361               0.4497354  0.4010695
## Lake Berryessa                0.2924528               0.3211009  0.3518519
##                         Lassen Volcanic Park Elkhorn Slough Lake Berryessa
## Yosemite Valley                    0.5473684      0.4268293      0.3782383
## Sierra Foothills                   0.3586498      0.4881517      0.4083333
## Point Reyes                        0.3203883      0.4333333      0.2822967
## Lake Tahoe                         0.3854167      0.3614458      0.3948718
## Channel Islands                    0.3301435      0.4098361      0.2924528
## Central Valley Wetlands            0.4232558      0.4497354      0.3211009
## Salton Sea                         0.3239437      0.4010695      0.3518519
## Lassen Volcanic Park               0.0000000      0.4891304      0.3521127
## Elkhorn Slough                     0.4891304      0.0000000      0.4010695
## Lake Berryessa                     0.3521127      0.4010695      0.0000000

# Create cluster dendrogram
# Cluster analysis using hierarchical clustering
bray_cluster <- hclust(bray_dist, method = "average")  # "average" = UPGMA linkage

# Plot the dendrogram
plot(bray_cluster,
     main = "Cluster Dendrogram (Bray–Curtis Dissimilarity)",
     xlab = "Sites",
     sub = "",
     col.main = "darkgreen")

# NMDS plots using Bray–Curtis
set.seed(123)
nmds_bray <- metaMDS(community_matrix, distance = "bray", k = 2, trymax = 100)

## Wisconsin double standardization
## Run 0 stress 0.159 
## Run 1 stress 0.2379666 
## Run 2 stress 0.1725648 
## Run 3 stress 0.1781684 
## Run 4 stress 0.1662069 
## Run 5 stress 0.2244295 
## Run 6 stress 0.229182 
## Run 7 stress 0.159 
## ... New best solution
## ... Procrustes: rmse 4.21791e-05  max resid 9.191029e-05 
## ... Similar to previous best
## Run 8 stress 0.1614267 
## Run 9 stress 0.2211794 
## Run 10 stress 0.1781685 
## Run 11 stress 0.2836832 
## Run 12 stress 0.1656401 
## Run 13 stress 0.197652 
## Run 14 stress 0.1574191 
## ... New best solution
## ... Procrustes: rmse 0.1350303  max resid 0.2927421 
## Run 15 stress 0.1514486 
## ... New best solution
## ... Procrustes: rmse 0.09328574  max resid 0.2090452 
## Run 16 stress 0.159 
## Run 17 stress 0.2170053 
## Run 18 stress 0.1781685 
## Run 19 stress 0.1999718 
## Run 20 stress 0.1983177 
## Run 21 stress 0.1590001 
## Run 22 stress 0.1514486 
## ... Procrustes: rmse 2.407752e-05  max resid 5.143648e-05 
## ... Similar to previous best
## *** Best solution repeated 1 times

scores_bray <- as.data.frame(scores(nmds_bray, display = "sites"))
scores_bray$Site <- rownames(community_matrix)

ggplot(scores_bray, aes(x = NMDS1, y = NMDS2, label = Site)) +
  geom_point() +
  geom_text(vjust = -0.5, size = 3) +
  theme_minimal() +
  labs(title = "NMDS (Bray–Curtis, all species)",
       x = "NMDS1", y = "NMDS2")

# NMDS using Jaccard (presence–absence)
comm_pa <- (community_matrix > 0) * 1
set.seed(123)
nmds_jaccard <- metaMDS(comm_pa, distance = "jaccard", k = 2, trymax = 100, binary = TRUE)

## Run 0 stress 0.03583591 
## Run 1 stress 0.03583592 
## ... Procrustes: rmse 0.1945133  max resid 0.3691144 
## Run 2 stress 0.03934665 
## Run 3 stress 0.03583581 
## ... New best solution
## ... Procrustes: rmse 0.08903071  max resid 0.1987291 
## Run 4 stress 0.03934637 
## Run 5 stress 0.03583583 
## ... Procrustes: rmse 0.1945205  max resid 0.3689705 
## Run 6 stress 0.03934652 
## Run 7 stress 0.1378718 
## Run 8 stress 0.03934633 
## Run 9 stress 0.03583593 
## ... Procrustes: rmse 0.2019919  max resid 0.3829852 
## Run 10 stress 0.03863904 
## Run 11 stress 0.03934653 
## Run 12 stress 0.03583605 
## ... Procrustes: rmse 0.1945298  max resid 0.3692326 
## Run 13 stress 0.03583864 
## ... Procrustes: rmse 0.09875127  max resid 0.2221792 
## Run 14 stress 0.03599741 
## ... Procrustes: rmse 0.1006874  max resid 0.2241908 
## Run 15 stress 0.03864122 
## Run 16 stress 0.140182 
## Run 17 stress 0.03583583 
## ... Procrustes: rmse 0.190498  max resid 0.3700431 
## Run 18 stress 0.140352 
## Run 19 stress 0.03583601 
## ... Procrustes: rmse 0.1722469  max resid 0.3749741 
## Run 20 stress 0.03583599 
## ... Procrustes: rmse 0.09864041  max resid 0.2222466 
## Run 21 stress 0.138227 
## Run 22 stress 0.03583609 
## ... Procrustes: rmse 0.1722271  max resid 0.3753714 
## Run 23 stress 0.03583588 
## ... Procrustes: rmse 0.1943861  max resid 0.3689731 
## Run 24 stress 0.03934661 
## Run 25 stress 0.03583595 
## ... Procrustes: rmse 0.0004854404  max resid 0.001107759 
## ... Similar to previous best
## *** Best solution repeated 1 times

scores_jaccard <- as.data.frame(scores(nmds_jaccard, display = "sites"))
scores_jaccard$Site <- rownames(community_matrix)

ggplot(scores_jaccard, aes(x = NMDS1, y = NMDS2, label = Site)) +
  geom_point() +
  geom_text(vjust = -0.5, size = 3) +
  theme_minimal() +
  labs(title = "NMDS (Jaccard, presence–absence)",
       x = "NMDS1", y = "NMDS2")

# Observation:
# Under Bray–Curtis NMDS, sites group together based on how similar their abundance patterns are.
# Coastal and foothill sites (Point Reyes, Channel Islands, Sierra Foothills) cluster because they
# share similar mixes of common and rare species. Montane sites (Yosemite Valley, Lassen Volcanic
# Park, Lake Tahoe) form their own cluster, reflecting high-elevation bird communities.
# The Jaccard NMDS, which uses only presence–absence data, shows a similar coastal vs. montane
# separation, but it makes sites appear more similar than they really are because it ignores how
# many individuals of each species are present. Sites that share species but in very different
# abundances get compressed together.


### 14. Abundance Curves ---

# Plot Rank–Abundance (Whittaker) Curves for all sites
radcurves <- radfit(community_matrix)  # fits rank–abundance models (optional)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge

## Error in glm.fit(x = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  : 
##   NA/NaN/Inf in 'x'

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge

## Error in glm.fit(x = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  : 
##   NA/NaN/Inf in 'x'

## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: algorithm did not converge

# Basic Whittaker plot for all sites
rankabundanceplot <- function(mat) {
  par(mfrow = c(2, 3))  # adjust layout for multiple sites
  for (i in 1:nrow(mat)) {
    site_name <- rownames(mat)[i]
    site_data <- as.numeric(mat[i, ])
    site_data <- site_data[site_data > 0]  # remove zeros
    site_data <- sort(site_data, decreasing = TRUE)
    rank <- 1:length(site_data)
    plot(rank, site_data, log = "y", type = "b", pch = 16, col = "darkblue",
         main = paste("Rank–Abundance Curve:", site_name),
         xlab = "Species Rank", ylab = "Abundance (log scale)")
  }
  par(mfrow = c(1, 1))
}

rankabundanceplot(community_matrix)

# Observation:
# The rank–abundance curves show how evenly species are distributed across sites. 
# Sites like Lake Berryessa, Sierra Foothills, Channel Islands, and Lassen Volcanic Park 
# have gradual curves, meaning many species occur in similar numbers and no single species dominates. 
# In contrast, sites such as Elkhorn Slough, Salton Sea, Yosemite Valley, and Lake Tahoe 
# have steep drops, meaning one or two species are much more abundant than the rest. 
# This indicates lower evenness and stronger environmental filtering at those sites. 
# The curves support earlier results that coastal and foothill sites tend to have 
# more balanced communities, while montane, wetland, and desert-edge sites often have 
# a few highly dominant species.


### 15. Euclidean distance and dendrogram ---

# Calculate Euclidean distance
euclidean_dist <- dist(community_matrix, method = "euclidean")

# View distance matrix
as.matrix(euclidean_dist)

##                         Yosemite Valley Sierra Foothills Point Reyes Lake Tahoe
## Yosemite Valley                 0.00000         20.07486    21.26029   18.27567
## Sierra Foothills               20.07486          0.00000    19.97498   24.39262
## Point Reyes                    21.26029         19.97498     0.00000   22.31591
## Lake Tahoe                     18.27567         24.39262    22.31591    0.00000
## Channel Islands                19.97498         21.72556    15.84298   21.61018
## Central Valley Wetlands        20.51828         21.44761    24.35159   22.47221
## Salton Sea                     21.28380         19.54482    20.07486   21.33073
## Lassen Volcanic Park           27.05550         23.64318    19.13113   21.35416
## Elkhorn Slough                 18.92089         25.94224    21.63331   17.08801
## Lake Berryessa                 21.74856         25.84570    16.09348   21.09502
##                         Channel Islands Central Valley Wetlands Salton Sea
## Yosemite Valley                19.97498                20.51828   21.28380
## Sierra Foothills               21.72556                21.44761   19.54482
## Point Reyes                    15.84298                24.35159   20.07486
## Lake Tahoe                     21.61018                22.47221   21.33073
## Channel Islands                 0.00000                20.88061   23.02173
## Central Valley Wetlands        20.88061                 0.00000   21.02380
## Salton Sea                     23.02173                21.02380    0.00000
## Lassen Volcanic Park           20.22375                25.19921   22.11334
## Elkhorn Slough                 20.95233                21.23676   20.17424
## Lake Berryessa                 16.12452                20.29778   21.72556
##                         Lassen Volcanic Park Elkhorn Slough Lake Berryessa
## Yosemite Valley                     27.05550       18.92089       21.74856
## Sierra Foothills                    23.64318       25.94224       25.84570
## Point Reyes                         19.13113       21.63331       16.09348
## Lake Tahoe                          21.35416       17.08801       21.09502
## Channel Islands                     20.22375       20.95233       16.12452
## Central Valley Wetlands             25.19921       21.23676       20.29778
## Salton Sea                          22.11334       20.17424       21.72556
## Lassen Volcanic Park                 0.00000       24.57641       20.12461
## Elkhorn Slough                      24.57641        0.00000       19.67232
## Lake Berryessa                      20.12461       19.67232        0.00000

# Cluster dendrogram
plot(hclust(euclidean_dist),
     main = "Cluster Dendrogram (Euclidean Distance)",
     xlab = "Sites",
     sub = "",
     col.main = "darkblue")

# Observation:
# The Euclidean dendrogram groups sites based on total abundance, not actual ecological similarity. 
# Because Euclidean distance is heavily influenced by overall counts, sites with similar 
# total numbers of birds get clustered together even if they have very different species. 
# This is why Yosemite Valley, Lake Tahoe, and Elkhorn Slough cluster—they have similar overall 
# abundance levels, not similar species communities. 
# Coastal and foothill sites (Point Reyes, Channel Islands, Lake Berryessa) also appear near each other, 
# but this grouping is less meaningful ecologically. 
# Overall, Euclidean distance is not ideal for community data because it ignores species identity 
# and mainly reflects differences in total abundance rather than true community structure.


### 16. All three dendrograms side-by-side (Euclidean, Bray-Curtis, Jaccard)---

bray_dist <- vegdist(community_matrix, method = "bray")
jaccard_dist <- vegdist(community_matrix, method = "jaccard", binary = TRUE)

par(mfrow = c(1, 3))
plot(hclust(euclidean_dist), main = "Euclidean", xlab = "", sub = "")
plot(hclust(bray_dist), main = "Bray–Curtis", xlab = "", sub = "")
plot(hclust(jaccard_dist), main = "Jaccard", xlab = "", sub = "")

par(mfrow = c(1, 1))

# Observation:
# Looking at the three dendrograms side-by-side shows that each distance measure highlights 
# different parts of community structure. The Euclidean dendrogram looks very different from 
# the other two because it mainly reflects total abundance—so sites with similar overall bird 
# counts (like Yosemite Valley, Lake Tahoe, and Elkhorn Slough) cluster together even if their 
# species compositions are not similar. 

# Bray–Curtis clusters sites that have similar species and similar abundances. 
# It shows a clear ecological pattern: the coastal and foothill sites 
# (Point Reyes, Channel Islands, Lake Berryessa) group together because they share 
# many of the same bird species in similar numbers. In contrast, the montane sites 
# (Yosemite Valley, Lassen Volcanic Park, Lake Tahoe) cluster separately, reflecting 
# their distinct high-elevation bird communities.

# The Jaccard dendrogram, which uses only presence–absence, shows a similar general structure 
# but flattens differences among sites that share species but in different abundances. 

# Bray–Curtis is the most informative here because it captures both which species 
# are present and how abundant they are, while Euclidean is the least appropriate for 
# community ecology since it mostly reflects total counts.


### 17. Final Interpretation ---

# Across all analyses—diversity metrics, abundance plots, dendrograms, and NMDS—
# the 10 California bird communities show consistent patterns.

# The abundance boxplots show major differences among sites. Sierra Foothills,
# Central Valley Wetlands, and Lake Berryessa have a wide spread of species
# abundances, meaning many species occur at moderate numbers. These sites are
# more even, with no single species dominating. On the other hand, Yosemite Valley,
# Elkhorn Slough, and sometimes Lake Tahoe have much tighter distributions, where
# one or two species are very common. This shows lower evenness and stronger
# environmental filtering, where only a few species thrive.

# The Jaccard dendrogram shows which sites share similar species lists.
# Lake Berryessa and Point Reyes cluster together, meaning they have very 
# similar coastal/foothill bird communities. Yosemite Valley and Lassen 
# Volcanic Park form a pair because they share montane (high-elevation) species.
# Central Valley Wetlands stands alone on its own branch, showing its species
# list is very different from the others—likely because wetland habitats support
# unique bird species.

# When looking at richness and evenness together, the coastal and foothill sites 
# (Point Reyes, Channel Islands, Sierra Foothills, Lake Berryessa) tend to have 
# both more species and more balanced abundances. These areas have moderate 
# climates and varied vegetation, which make it easier for many species to coexist.
# While extreme or isolated habitats, like high-elevation areas, 
# wetlands, or desert-edge site, usually have fewer species or communities that 
# are dominated by just a few specialists. Environmental stress limits which species can survive there.

# Coastal and foothill sites contribute a lot to regional diversity because they 
# support many species. At the same time, more specialized habitats (like 
# Yosemite Valley, Lassen Volcanic Park, and Central Valley Wetlands) contain
# unique species that are not found in other areas. This means California needs
# to protect both the species-rich habitats and the more specialized ones.
# Keeping habitats connected across the landscape is especially important so 
# species can move, migrate, and persist over time.

# For beta diversity, Bray–Curtis is the most helpful metric for this dataset.
# It uses abundance information, so it captures both which species are present
# and how common they are. Jaccard is still useful for showing differences in species
# composition, but it ignores abundances. Euclidean distance is the least useful
# because it mostly reflects total counts, not real ecological similarity.

# The alpha-diversity metrics (Shannon, Simpson, Gini-Simpson, and Evenness) help
# explain both how many species a site has and how evenly those species are distributed.
# Shannon is sensitive to rare species, while Gini-Simpson focuses more on common ones.
# Simpson’s Evenness shows how balanced the abundances are. Together, these metrics show
# that coastal and foothill sites tend to be both species-rich and evenly balanced, while
# montane, wetland, and desert-edge sites often have fewer species or communities where
# a few species dominate.

# All these results show that coastal and foothill sites tend to have
# both high species richness and balanced abundances, making them the most
# diverse overall. Sites that are more isolated or environmentally
# extreme often have fewer species or are dominated by just a few. Abundance-based
# metrics like Bray–Curtis and Shannon describe these differences the best and
# give the clearest ecological picture of how these bird communities differ.

FINAL-R-script.R

ximen

2025-11-22