Stomachs EDA

Importing & Cleaning

#install.packages("tidyverse")
#install.packages("janitor")
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)

Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test
library(lubridate)

rm(list = ls(envir = .GlobalEnv), envir = .GlobalEnv)

sd <- read_csv("stomachdata.csv")
Rows: 884 Columns: 23
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (14): ID, Date Collected, FishSpp, Date_sort, Sorter, Order, Family, Ter...
dbl  (8): Reach, Sample, fish_TL_mm, grams, Digestion_code, ID Hrs, Count, d...
lgl  (1): weight

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sd <- clean_names(sd)

sd_clean <- sd %>% 
  select(-c('matthews_notes')) %>% #removing dumb columns
  mutate(
    across(where(is.character), tolower
           )) %>% 
  mutate(
    family = case_when(
    order == "araneae" ~ "terrestrial", 
    family == "staphylinidae" ~ "terrestrial",
    order == "noinverts" ~ "noinverts",
    is.na(family) ~ "unk",
      TRUE ~ family
    )) %>%  #if family wasnt entered = unknown unless nI, and staphylinidae are terrestrial
   mutate(
    order = case_when(
    order == "nematode" ~ "nematomorpha", 
    order == "unknown" ~ "unk",
    is.na(order) ~ "unk",
      TRUE ~ order
    ))  %>% #analysings nematodes and nematomorphs as one, fixing unk  
  mutate(
    terrestrial = case_when(
      terrestrial == "y" ~ 1,
      terrestrial == "t" ~ 1,
      terrestrial == "n" ~ 0,
    )) %>% #changings columns for y/n to 1/0/NA
  mutate(
    whole = case_when(
      whole == "y" ~ 1,
      whole == "n" ~ 0,
      whole == "h" ~ 0,
      whole == "?" ~ NA
    )) %>% #changings columns for y/n to 1/0/NA
  mutate(
    dried = case_when(
      is.na(dried) ~ 0,
      TRUE ~ dried
    )) %>% #changings columns for y/n to 1/0/NA
  mutate(
    date_collected = mdy(date_collected), #date into lubridate compatible
    year = year(date_collected) #adding year column for later comparisons
    ) %>% 
  rename(
    n = count
  )

#making sure we dont have duplicate orders and species names
unique(sd_clean$order)
 [1] "noinverts"     "hymenoptera"   "diptera"       "ephemeroptera"
 [5] "nematomorpha"  "nematoda"      "trichoptera"   "plecoptera"   
 [9] "coleoptera"    "unk"           "lepidoptera"   "hemiptera"    
[13] "megaloptera"   "annelida"      "acari"         "orthoptera"   
[17] "araneae"       "odonata"       "thirps"        "crustacea"    
unique(sd_clean$family)
 [1] "noinverts"       "terrestrial"     "chironomidae"    "unk"            
 [5] "ephemerellidae"  "heptageniidae"   "baetidae"        "rhyacophilidae" 
 [9] "peltoperlidae"   "chloroperlidae"  "simuliidae"      "nematoda"       
[13] "elmidae"         "pediciidae"      "perlidae"        "perlodidae"     
[17] "hydropsychidae"  "sialidae"        "leuctridae"      "leptophlebiidae"
[21] "tipulidae"       "ceratopogonidae" "brachycentridae" "empididae"      
[25] "ameletidae"      "dixidae"         "nemouridae"      "ostracod"       
[29] "limoniidae"      "apataniidae"     "thremmatidae"   
unique(sd_clean$fish_spp)
[1] "lct" "bk"  NA   

Basic Tables & Graphs

###Bug counts by order for each species x year###
order_count_bk_2022 <- sd_clean %>% 
  filter(fish_spp == "bk",
         year == "2022") %>% 
  group_by(order) %>% 
  summarize(n = sum(n, na.rm = T))

order_count_lct_2022 <- sd_clean %>% 
  filter(fish_spp == "lct",
         year == "2022") %>% 
  group_by(order) %>% 
  summarize(n = sum(n, na.rm = T))  

order_count_lct_2025 <- sd_clean %>% 
  filter(fish_spp == "lct",
         year == "2025") %>% 
  group_by(order) %>% 
  summarize(n = sum(n, na.rm = T))

sum(order_count_bk_2022$n)
[1] 2771
sum(order_count_lct_2022$n)
[1] 4966
sum(order_count_lct_2025$n)
[1] 266
####Families in 2022###
family_count_2022 <- sd_clean %>% 
  filter(year == "2022") %>% 
  group_by(family) %>% 
  summarize(n = sum(n, na.rm = T))

###How many individuals###
unique_fish_bk_2022 <- sd_clean %>%
  filter(fish_spp == "bk",
         year == "2022") %>% 
  group_by(sample) %>%
  summarize(
    id = first(id),
    sample = first(sample),
    total_count = sum(`n`, na.rm = TRUE),
    tail_length = first(fish_tl_mm),
    digestion_code = first(digestion_code),
    .groups = "drop"
  )

unique_fish_lct_2025 <- sd_clean %>%
  filter(fish_spp == "lct",
         year == "2025") %>% 
  group_by(sample) %>%
  summarize(
    id = first(id),
    sample = first(sample),
    total_count = sum(`n`, na.rm = TRUE),
    tail_length = first(fish_tl_mm),
    digestion_code = first(digestion_code),
    .groups = "drop"
  )

unique_fish_lct_2022 <- sd_clean %>%
  filter(fish_spp == "lct",
         year == "2022") %>% 
  group_by(id) %>%
  summarize(
    id = first(id),
    sample = first(sample),
    total_count = sum(`n`, na.rm = TRUE),
    tail_length = first(fish_tl_mm),
    year = first(year),
    digestion_code = first(digestion_code),
    .groups = "drop"
  )

###All Unique Fish in One Place###
unique_fish <- sd_clean %>%
  group_by(id) %>%
  filter(fish_spp %in% c('lct', 'bk')) %>% 
  summarize(
    id = first(id),
    total_count = sum(`n`, na.rm = TRUE),
    fish_spp = first(fish_spp),
    tail_length = first(fish_tl_mm),
    year = first(year),
    digestion_code = first(digestion_code),
    grams = first(grams),
    .groups = "drop"
  )

unique_fish_lct <- sd_clean %>%
  filter(fish_spp == "lct") %>% 
  group_by(id) %>%
  summarize(
    id = first(id),
    total_count = sum(`n`, na.rm = TRUE),
    tail_length = first(fish_tl_mm),
    year = first(year),
    digestion_code = first(digestion_code),
    .groups = "drop"
  )

###How do Total counts vary across year###
ggplot(unique_fish_lct, aes(x = as.factor(year), y = total_count)) +
  geom_boxplot(fill = "lightblue", outlier.shape = NA) +  # boxplot
  geom_jitter(width = 0.2, size = 2, color = "darkblue", alpha = 0.7) +  # individual points
  labs(
    title = "Total Count Inverts by Year for LCT",
    x = "Year",
    y = "Total Count Inverts"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13)
  )

ggsave("count_by_year.png",
      width = 12,       # width in inches
       height = 8,       # taller height
      dpi = 300)        # high-res

unique_fish %>% 
  filter(year == "2025",
         fish_spp == "lct") %>%
  summary()
      id             total_count       fish_spp          tail_length   
 Length:30          Min.   : 0.000   Length:30          Min.   :112.0  
 Class :character   1st Qu.: 4.250   Class :character   1st Qu.:153.2  
 Mode  :character   Median : 7.000   Mode  :character   Median :166.5  
                    Mean   : 8.867                      Mean   :179.2  
                    3rd Qu.:13.000                      3rd Qu.:203.2  
                    Max.   :27.000                      Max.   :266.0  
                                                                       
      year      digestion_code      grams     
 Min.   :2025   Min.   :2.000   Min.   : 9.0  
 1st Qu.:2025   1st Qu.:4.000   1st Qu.: 9.0  
 Median :2025   Median :4.000   Median :25.0  
 Mean   :2025   Mean   :3.885   Mean   :22.2  
 3rd Qu.:2025   3rd Qu.:4.000   3rd Qu.:30.0  
 Max.   :2025   Max.   :5.000   Max.   :38.0  
                NA's   :4       NA's   :25    
unique_fish %>% 
  filter(year == "2022",
         fish_spp == "lct") %>%
  summary()
      id             total_count      fish_spp          tail_length    
 Length:36          Min.   :  0.0   Length:36          Min.   : 67.00  
 Class :character   1st Qu.: 42.5   Class :character   1st Qu.: 89.75  
 Mode  :character   Median : 77.5   Mode  :character   Median :104.00  
                    Mean   :137.9                      Mean   :120.47  
                    3rd Qu.:182.5                      3rd Qu.:135.25  
                    Max.   :519.0                      Max.   :295.00  
                                                                       
      year      digestion_code      grams       
 Min.   :2022   Min.   :1.000   Min.   :  2.00  
 1st Qu.:2022   1st Qu.:2.000   1st Qu.:  6.75  
 Median :2022   Median :3.000   Median :  9.00  
 Mean   :2022   Mean   :2.911   Mean   : 27.08  
 3rd Qu.:2022   3rd Qu.:4.000   3rd Qu.: 24.25  
 Max.   :2022   Max.   :5.000   Max.   :205.00  
                NA's   :5                       
unique_fish %>% 
  filter(year == "2022",
         fish_spp == "bk") %>%
  summary()
      id             total_count       fish_spp          tail_length   
 Length:32          Min.   :  0.00   Length:32          Min.   : 64.0  
 Class :character   1st Qu.: 15.50   Class :character   1st Qu.: 76.0  
 Mode  :character   Median : 42.00   Mode  :character   Median :144.5  
                    Mean   : 86.59                      Mean   :131.8  
                    3rd Qu.:130.50                      3rd Qu.:176.0  
                    Max.   :517.00                      Max.   :200.0  
                                                                       
      year      digestion_code      grams      
 Min.   :2022   Min.   :1.000   Min.   : 2.00  
 1st Qu.:2022   1st Qu.:2.750   1st Qu.: 4.00  
 Median :2022   Median :3.000   Median :28.50  
 Mean   :2022   Mean   :3.032   Mean   :33.09  
 3rd Qu.:2022   3rd Qu.:3.500   3rd Qu.:56.50  
 Max.   :2022   Max.   :4.000   Max.   :86.00  
                NA's   :1                      

Basic Biodiversity Visualizations

#What Families Did we See in LCT & BK 2022
ggplot(family_count_2022, aes(x = reorder(family, -n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Taxonomic Family Quantity - LCT & BK 2022",
    x = "Family",
    y = "Count"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 50, hjust = 1),
    plot.title = element_text(face = "bold")
  )

ggsave("families_2022.png",
      width = 12,       # width in inches
       height = 8,       # taller height
       dpi = 300) 

#What Orders Did We See in LCT22
order_count_lct_2022 %>% filter(
  order != "noinverts"
) %>% 
ggplot(aes(x = reorder(order, -n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Taxonomic Order Quantity - LCT 2022",
    x = "Order",
    y = "Count"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 50, hjust = 1),
    plot.title = element_text(face = "bold")
  )

ggsave("orders_lct_2022.png",
     width = 12,       # width in inches
      height = 8,       # taller height
       dpi = 300) 

# Summarize counts at family × species level
family_count_2022_spp <- sd_clean %>% 
  filter(year == "2022") %>% 
  group_by(fish_spp, family) %>% 
  summarize(n = sum(n, na.rm = T))
`summarise()` has grouped output by 'fish_spp'. You can override using the
`.groups` argument.
family_count_2022_summary <- family_count_2022_spp %>%
  group_by(family, fish_spp) %>%
  summarize(n = sum(n, na.rm = TRUE), .groups = "drop") %>%
  group_by(family) %>%
  mutate(total_n = sum(n)) %>%
  mutate(normalized = ifelse(fish_spp == "lct", n / 4966, n / 2846)) %>% 
  ungroup() 

# Plot stacked bar chart
family_count_2022_summary %>% 
  filter(family %in% c('chironomidae', 'baetidae', "simuliidae", "heptageniidae", "ephemerellidae")) %>% 
ggplot(aes(x = reorder(family, -total_n), y = normalized, fill = fish_spp)) +
  geom_col(position = position_dodge(width = 0.8)) +
  labs(
    x = "Family",
    y = "Percent Diet (0 - 1)",
    fill = "Species",
    title = "Families by Species - 2022"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 50, hjust = 1),
    plot.title = element_text(face = "bold")
  )

ggsave("family_by_species_2022.png",
      width = 12,       # width in inches
       height = 8,       # taller height
       dpi = 300)        # high-res

Size (Age) Classes

###LCT Sizes Histogram
unique_fish %>% filter(fish_spp == "lct") %>% 
ggplot(aes(x = tail_length)) +
  geom_histogram(
    binwidth = 5,
    fill = "steelblue",
    color = "black"
  ) +
  labs(title = "LCT Size Classes") +
  theme_bw()

###BK Sizes Histogram
unique_fish %>% filter(fish_spp == "bk") %>% 
ggplot(aes(x = tail_length)) +
  geom_histogram(
    binwidth = 5,
    fill = "steelblue",
    color = "black"
  ) +
  labs(title = "BK Size Classes") +
  theme_bw()

###Fish Tail Length Boxplot
ggplot(unique_fish, aes(x = fish_spp, y = tail_length)) +
  geom_boxplot(fill = "lightblue", outlier.shape = NA) +  # boxplot
  geom_jitter(width = 0.2, size = 2, color = "darkblue", alpha = 0.7) +
  labs(
    title = "Total Length Distribution by Species - 2022",
    x = "Fish Species",
    y = "Total Length"
  ) +
  theme_bw() +
  labs(title = "Fish Tail Length") +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13))

###Fish Weight Boxplot
ggplot(unique_fish, aes(x = fish_spp, y = grams)) +
  geom_boxplot(fill = "lightblue", outlier.shape = NA) +  # boxplot
  geom_jitter(width = 0.2, size = 2, color = "darkblue", alpha = 0.7) +
  labs(
    title = "Total Mass Distribution - 2022",
    subtitle = "Some high outliers have been cutoff by y axis",
    x = "Fish Species",
    y = "Weight (g)"
  ) +
  coord_cartesian(ylim = c(0, 100)) +   # adjust upper limit
  theme_bw() +
  labs(title = "Fish Weight") +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 13))
Warning: Removed 25 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 25 rows containing missing values or values outside the scale range
(`geom_point()`).

#General Numerical Fish Relationships
unique_fish_bk_1 <- unique_fish %>% 
  filter(fish_spp == "bk") %>% 
  select(total_count, tail_length, digestion_code, grams)
plot(unique_fish_bk_1, main = "BK FISH METRICS 2022")

unique_fish_lct_1 <- unique_fish %>% 
  filter(fish_spp == "lct") %>% 
  select(total_count, tail_length, digestion_code, grams)
plot(unique_fish_lct_1, main = "LCT FISH METRICS 2022")

###BK YOY Diet Differences
unique_fish <- unique_fish %>% 
  mutate(
    yoy = if_else(
      fish_spp == "bk" & tail_length < 100,
      1,
      0
    )) %>% 
  mutate(yoy = ifelse(
    fish_spp == "lct", NA, yoy
  ))

###Seperating out counts by YOY by Family by Order
diversity_within_yoy <- sd_clean %>% 
    filter(fish_spp == "bk",
           year == "2022") %>% 
    mutate(
    yoy = if_else(
      fish_spp == "bk" & fish_tl_mm < 100,
      1,
      0
    )) %>% 
  filter(year == "2022") %>% 
  group_by(fish_spp, family, yoy) %>%
  summarize(n = sum(n, na.rm = TRUE), .groups = "drop") %>%
  group_by(family) %>%
  mutate(total_n = sum(n)) %>%
  mutate(normalized = ifelse(fish_spp == "bk", n / total_n, NA)) %>% 
  ungroup()

unique_fish %>% 
  filter(fish_spp == "bk") %>% 
  summary()
      id             total_count       fish_spp          tail_length   
 Length:32          Min.   :  0.00   Length:32          Min.   : 64.0  
 Class :character   1st Qu.: 15.50   Class :character   1st Qu.: 76.0  
 Mode  :character   Median : 42.00   Mode  :character   Median :144.5  
                    Mean   : 86.59                      Mean   :131.8  
                    3rd Qu.:130.50                      3rd Qu.:176.0  
                    Max.   :517.00                      Max.   :200.0  
                                                                       
      year      digestion_code      grams            yoy        
 Min.   :2022   Min.   :1.000   Min.   : 2.00   Min.   :0.0000  
 1st Qu.:2022   1st Qu.:2.750   1st Qu.: 4.00   1st Qu.:0.0000  
 Median :2022   Median :3.000   Median :28.50   Median :0.0000  
 Mean   :2022   Mean   :3.032   Mean   :33.09   Mean   :0.4062  
 3rd Qu.:2022   3rd Qu.:3.500   3rd Qu.:56.50   3rd Qu.:1.0000  
 Max.   :2022   Max.   :4.000   Max.   :86.00   Max.   :1.0000  
                NA's   :1                                       
unique_fish_bk <- unique_fish %>% 
  filter(fish_spp == "bk") 

ggplot(diversity_within_yoy, aes(x = reorder(family, -n), y = n)) +
  facet_wrap(~yoy) +
  geom_col(fill = "steelblue") +
  labs(
    title = "YOY vs Old BK Feeding",
    x = "Family",
    y = "Count"
  ) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 50, hjust = 1),
    plot.title = element_text(face = "bold")
  )