Zooplankton Lab

Alexis Provencal

Objectives

Tidy messy dataframe so all variables are columns
Join plankton and water quality dataframes together
Produce a bar graph with the results of a means test of the abundance of each of the plankton groups
Produce a bar graph with the results of a means test of species richness by “station”
Produce a scatterplot graph with the results of a correlation test between abundance and salinity
Produce a scatterplot graph with the results of a correlation test between species richness and salinity

Loading my functions

ggdynamiteplot makes means plots with error bars for sd or se. It makes summary statistics inside the function, making dynamite plots easier and faster to produce. theme_manuscript is a themeing function which alters the appearance of the figures to meet the requirements of the professor.

library(ggplot2)

ggdynamiteplot <- function(data, x, y, error = c("sd", "se"), width = 0.5) {
  if (!(error %in% c("sd", "se"))) {
    stop("Error: Error parameter must be either 'sd' or 'se'")
  }
  if (error == "sd") {
    summary_stats <- data %>%
      dplyr::group_by({{x}}) %>%
      dplyr::summarize(
        mean_y = mean({{y}}, na.rm = TRUE),
        error_y = sd({{y}}, na.rm = TRUE)
      ) %>%
      arrange(desc(mean_y))
  } else if (error == "se") {
    summary_stats <- data %>%
      dplyr::group_by({{x}}) %>%
      dplyr::summarize(
        mean_y = mean({{y}}, na.rm = TRUE),
        error_y = sd({{y}}, na.rm = TRUE) / sqrt(dplyr::n())
      ) %>%
      arrange(desc(mean_y))
  }
  
  summary_stats <- summary_stats %>%
    mutate({{x}} := factor({{x}}, levels = {{x}}[order(mean_y, decreasing = TRUE)]))

  plot <- ggplot2::ggplot(summary_stats, ggplot2::aes(x = {{x}}, y = mean_y)) +
    ggplot2::geom_bar(
      stat = "identity",
      position = "identity",
      fill = "gray70",
      width = width
    ) +
    ggplot2::geom_errorbar(ggplot2::aes(ymin = mean_y - error_y, ymax = mean_y + error_y),
                           width = 0.2, color = "black", linewidth = 0.5) +
    ggplot2::scale_y_continuous(expand = c(0,0)) +
    ggplot2::labs(y = as_label(ensym(y)))

  return(plot)

}

theme_manuscript = 
  function(font = NA, base_size = 12){
  theme(
    #Text font
    text = element_text(family = font, size = base_size),
    
    #Color
    line = element_line(color = "black"),
    panel.background = element_rect(fill = "white"), #fill plot background
    legend.key = element_rect(fill = "white"), #fill background of legend key
    axis.line = element_line(color = "black"), #color axes lines
    
    #Positioning
    legend.position = "right", #change legend position
      #options: none, left, right, bottom, top
    
    #Text sizes and options
    plot.title = element_text(
      size = base_size*1.3, #title size
      face = "plain"), #options: plain, italic, bold, bold.italic
    axis.title = element_text(size = base_size), #axis titles text size
    axis.text = element_text(size = base_size, color = "black"), #axis labels/tick numbers text size
    legend.text = element_text(size = base_size), #legend text size
    legend.title = element_text(size = base_size) #legend title text size
  )
}

Importing data

These data are results from zooplankton counts we did in lab under dissection microscope. When the plankton were collected, water quality data were also collected for each water sample (temperature, salinity, and DO). The objectives of this lab were to identify differences in abundance among plankton groups and among stations. Additionally, to see if there was a correlation between salinity and species abundance or richness.

df_raw = read.csv("plankton1.csv", strip.white = TRUE, check.names = FALSE)
glimpse(df_raw)

## Rows: 27
## Columns: 26
## $ Station                     <chr> "OUT", "OUT", "OUT", "OUT", "OUT", "OUT", …
## $ Sample                      <int> 1, 1, 1, 7, 7, 7, 13, 13, 13, 5, 5, 5, 9, …
## $ Chaetognath                 <int> 0, 1, 5, 0, 0, 0, 1, 4, 3, 8, 77, 161, 0, …
## $ Copepod                     <int> 141, 99, 133, 169, 305, 452, 214, 208, 272…
## $ `Cope. Nauplii`             <int> 0, 0, 1, 1, 0, 0, 3, 0, 4, 10, 7, 8, 8, 3,…
## $ Lucifer                     <int> 1, 0, 0, 2, 2, 3, 8, 17, 3, 2, 2, 8, 3, 5,…
## $ Mysid                       <int> 0, 0, 0, 3, 5, 11, 79, 29, 19, 1, 4, 7, 20…
## $ `Crab Zoea`                 <int> 1, 1, 0, 0, 2, 2, 3, 17, 59, 2, 5, 4, 3, 8…
## $ `Crab Megalops`             <int> 1, 0, 0, 4, 4, 17, 16, 26, 30, 0, 1, 0, 1,…
## $ `Fish larvae`               <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Eggs                        <int> 0, 0, 4, 18, 33, 50, 6, 10, 9, 8, 26, 9, 4…
## $ `Unknown Gastropod Larvae`  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ `Unknown Cephalopod Larvae` <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Amphipods                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Cnidarian Larvae`          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Sacphopod                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Salp                        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Segmented Worm Larvae`     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ Ostrocod                    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 47, 0, …
## $ Isopod                      <int> 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ `Brittle star`              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Polychaete larvae`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Laravcean                   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ `Unknown Crustcean`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Jelly                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 25, 41, 0, …
## $ Snail                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

wq = read.csv("plankton_wq.csv", strip.white = TRUE)
wq <- dplyr::select(wq, -Station)
head(wq)

##   Sample Temperature Salinity   DO
## 1      1        24.3     35.3 6.74
## 2      7        24.4     35.4 6.73
## 3     13        24.1     35.5 6.71
## 4      5        23.6     30.8 6.78
## 5      9        23.7     30.9 6.82
## 6     15        23.9     30.8 6.86

Tidy data

To make this data tidy, each variable needs to be a column. So, there needs to be one column for Plankton Group.

library(tidyr)
library(dplyr)

df <- df_raw %>%
  tidyr::pivot_longer(
    cols = "Chaetognath":"Snail",
    names_to = "Plankton_Group",
    values_to = "Count"
  ) %>%
  dplyr::group_by(Station, Sample, Plankton_Group) %>%
  dplyr::summarize(Total_Count = sum(Count))

## `summarise()` has grouped output by 'Station', 'Sample'. You can override using
## the `.groups` argument.

df$Plankton_Group <- as.factor(df$Plankton_Group)

head(df)

## # A tibble: 6 × 4
## # Groups:   Station, Sample [1]
##   Station Sample Plankton_Group   Total_Count
##   <chr>    <int> <fct>                  <int>
## 1 MID          5 Amphipods                  0
## 2 MID          5 Brittle star               0
## 3 MID          5 Chaetognath              246
## 4 MID          5 Cnidarian Larvae           0
## 5 MID          5 Cope. Nauplii             25
## 6 MID          5 Copepod                  235

Part A

A bar graph with the results of a means test of the abundance of each of the plankton groups

ggdynamiteplot(df, Plankton_Group, Total_Count, error = "se", width = 0.25) +
  labs(x = "Plankton Group", y = "Count") +
  theme_manuscript(font = "Times", base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

shapiro.test(df$Total_Count)

## 
##  Shapiro-Wilk normality test
## 
## data:  df$Total_Count
## W = 0.28231, p-value < 2.2e-16

kruskal.test(Total_Count ~ Plankton_Group, data = df)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Total_Count by Plankton_Group
## Kruskal-Wallis chi-squared = 149.31, df = 23, p-value < 2.2e-16

# Perform post-hoc Dunn's test
library(FSA)
library(rcompanion)
Phocdunn <- dunnTest(Total_Count ~ Plankton_Group, data = df, method="bonferroni")

Phocdunns <- Phocdunn$res

cld <- cldList(comparison = Phocdunns$Comparison,
        p.value    = Phocdunns$P.adj,
        threshold  = 0.05)[1:2]

names(cld)[1]<-"Plankton_Group" # change the name of grouping factor according to the dataset (df)
cld

##             Plankton_Group Letter
## 1                Amphipods      a
## 2              Brittlestar     ab
## 3              Chaetognath   abcd
## 4          CnidarianLarvae     ab
## 5             Cope.Nauplii   abcd
## 6                  Copepod      c
## 7             CrabMegalops   abcd
## 8                 CrabZoea     cd
## 9                     Eggs     cd
## 10              Fishlarvae    abd
## 11                  Isopod    abd
## 12                   Jelly    abd
## 13               Laravcean     ab
## 14                 Lucifer    bcd
## 15                   Mysid    bcd
## 16                Ostrocod    abd
## 17        Polychaetelarvae    abd
## 18               Sacphopod      a
## 19                    Salp      a
## 20     SegmentedWormLarvae     ab
## 21                   Snail     ab
## 22 UnknownCephalopodLarvae      a
## 23        UnknownCrustcean     ab
## 24  UnknownGastropodLarvae    abd

Part B

A bar graph with the results of a means test of species richness by “station”

species_richness <- df %>%
  dplyr::group_by(Station, Sample) %>%
  dplyr::filter(Total_Count != 0) %>%
  dplyr::summarize(Species_count = n_distinct(Plankton_Group))

## `summarise()` has grouped output by 'Station'. You can override using the
## `.groups` argument.

head(species_richness)

## # A tibble: 6 × 3
## # Groups:   Station [2]
##   Station Sample Species_count
##   <chr>    <int>         <int>
## 1 MID          5            10
## 2 MID          9            12
## 3 MID         15            14
## 4 OUT          1             8
## 5 OUT          7             8
## 6 OUT         13             8

ggdynamiteplot(species_richness, Station, Species_count, error = "se") +
  labs(x = "Station", y = "Species Richness") +
  theme_manuscript(font = "Times")

shapiro.test(species_richness$Species_count)

## 
##  Shapiro-Wilk normality test
## 
## data:  species_richness$Species_count
## W = 0.89577, p-value = 0.2285

library(agricolae)
model <- aov(Species_count ~ Station, data = species_richness)
summary(model)

##             Df Sum Sq Mean Sq F value Pr(>F)  
## Station      2     24  12.000     7.2 0.0254 *
## Residuals    6     10   1.667                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

tukey_result <- HSD.test(model, trt = "Station")
print(tukey_result$groups)

##     Species_count groups
## MID            12      a
## WAI            10     ab
## OUT             8      b

Part C

A scatterplot graph with the results of a correlation test between abundance and salinity

df_C <- df %>%
  dplyr::group_by(Station, Sample) %>%
  dplyr::summarize(Total_abund = sum(Total_Count)) %>%
  dplyr::left_join(y = wq, by = "Sample")

## `summarise()` has grouped output by 'Station'. You can override using the
## `.groups` argument.

ggplot(
  data = df_C,
  mapping = aes(x = Salinity, y = Total_abund)
  ) +
  geom_point() +
  scale_y_continuous(limits = c(0,1200))+
  scale_x_continuous(limits = c(30,36), 
                     breaks = seq(30,36, by = 2),
                     expand = c(0, 0)) +
  labs(x = "Salinity", y = "Abundance") +
  theme_manuscript(font = "Times", base_size = 11)

cor.test(df_C$Salinity, df_C$Total_abund, method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  df_C$Salinity and df_C$Total_abund
## S = 50.711, p-value = 0.1035
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5774109

Part D

A scatterplot graph with the results of a correlation test between species richness and salinity

df_D <- df %>%
  dplyr::group_by(Station, Sample) %>%
  dplyr::filter(Total_Count != 0) %>%
  dplyr::summarize(Species_count = n_distinct(Plankton_Group)) %>%
  dplyr::left_join(y = wq, by = "Sample")

## `summarise()` has grouped output by 'Station'. You can override using the
## `.groups` argument.

ggplot(
  data = df_D,
  mapping = aes(x = Salinity, y = Species_count)
  ) +
  geom_point() +
  scale_x_continuous(limits = c(30,36), 
                     breaks = seq(30,36, by = 2),
                     expand = c(0, 0)) +
  labs(x = "Salinity", y = "Species Richness") +
  theme_manuscript(font = "Times", base_size = 11)

cor.test(df_D$Salinity, df_D$Species_count, method = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  df_D$Salinity and df_D$Species_count
## S = 216.94, p-value = 0.008444
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.8078103