Directory and doc rules

knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = TRUE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 20,       # Set plot width in inches
  fig.height = 9,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)

Load packages

library(tidyr)
library(tidyverse)
library(vegan)

Load data

Weight = mg
Length, width, height = mm
p450, SOD = activity/ (mg/protein)
Condition factor, economic factor = unitless

getwd()
## [1] "/Users/cmantegna/Documents/WDFWmussels/code"
#data has all sites, coordinates, p450, sod, condition factor, economic factor data
data<- read.csv("/Users/cmantegna/Documents/WDFWmussels/data/biomarkerfull.csv")

#alldata has the site names, biomarkers, condition factor, average thickness and analyte data - each row is an individual sample
alldata<- read.csv("/Users/cmantegna/Documents/WDFWmussels/data/alldata.csv")

fix zero’s in the data frame and alldata frame

# Data contains 0's and must be adjusted in this order to preserve all usable data.

#sod
#replace any SOD values at or below 0 with half of the lower detection limit of .005 (.005*.5). Lower detection limit determined by assay protocol by the manufacturer, Cayman.
data$SOD[data$SOD <= 0] <- 0.0025
alldata$SOD[alldata$SOD <= 0] <- 0.0025
#p450
#remove any p450 values that are 0 - those are true 0's not non-detectable. I am replacing with na so I don't lose the entire row of data, including the SOD values.
data$p450[data$p450 <= 0] <- NA
alldata$p450[alldata$p450 <= 0] <- NA

#write.csv(alldata, "/Users/cmantegna/Documents/WDFWmussels/data/alldata.csv")
# check the data frame
#summary(alldata)
#str(alldata)

Spearman’s Correlation, all sites, biomarkers, condition_factor and avg_thickness

SC is read from -1 to 1. Values of either -1 or 1 indicate perfect correlations, 0 indicates no correlations and all other values indicate some correlation but that value must be viewed in relation to the associated p-value.

library(dplyr)
library(ggplot2)


# Ensure the columns are numeric by converting them explicitly if necessary
data$p450 <- as.numeric(data$p450)
data$SOD <- as.numeric(data$SOD)
data$condition_factor <- as.numeric(data$condition_factor)
data$avg_thickness <- as.numeric(data$avg_thickness)
data$site_number <- as.numeric(data$site_number)

# Fill missing values with median for 'p450' and 'condition_factor'
data$p450[is.na(data$p450)] <- median(data$p450, na.rm = TRUE)
data$condition_factor[is.na(data$condition_factor)] <- median(data$condition_factor, na.rm = TRUE)

# Calculate Spearman's rank correlation matrix for selected variables
selected_vars <- data %>% select(site_number, p450, SOD, condition_factor, avg_thickness)
correlation_matrix <- cor(selected_vars, method = "spearman")
print(correlation_matrix)
##                  site_number        p450         SOD condition_factor
## site_number       1.00000000  0.09682605 -0.09801992      -0.22554332
## p450              0.09682605  1.00000000 -0.23212416       0.02678856
## SOD              -0.09801992 -0.23212416  1.00000000      -0.08788349
## condition_factor -0.22554332  0.02678856 -0.08788349       1.00000000
## avg_thickness     0.12109155  0.19029387 -0.07075347       0.13921086
##                  avg_thickness
## site_number         0.12109155
## p450                0.19029387
## SOD                -0.07075347
## condition_factor    0.13921086
## avg_thickness       1.00000000
#write.csv(correlation_matrix, "/Users/cmantegna/Documents/WDFWmussels/output/tables/all_corr.csv")

Spearman’s Correlation by region, biomarkers, condition_factor and avg_thickness

library(dplyr)

# Define regions based on latitude
data$region <- case_when(
  data$latitude >= 47.78184 & data$latitude <= 48.8208 ~ "North",
  data$latitude >= 47.38566 & data$latitude <= 47.7296079 ~ "Central",
  data$latitude >= 47.052362 & data$latitude <= 47.3539722 ~ "South",
  TRUE ~ NA_character_
)

# Filter out rows without a region designation (if any)
data <- data %>% filter(!is.na(region))

# Function to calculate and format Spearman correlation as a data frame
calculate_correlation <- function(data, region_name) {
  corr <- cor(data %>% select(site_number, p450, SOD, condition_factor, avg_thickness), 
              method = "spearman")
  corr_df <- as.data.frame(as.table(corr))
  names(corr_df) <- c("Variable1", "Variable2", "Correlation")
  corr_df$Region <- region_name
  return(corr_df)
}

# Calculate correlations for each region and bind them into a single data frame
north_corr <- calculate_correlation(data %>% filter(region == "North"), "North")
central_corr <- calculate_correlation(data %>% filter(region == "Central"), "Central")
south_corr <- calculate_correlation(data %>% filter(region == "South"), "South")

all_corr <- bind_rows(north_corr, central_corr, south_corr)

# Print the consolidated correlation data frame
head(all_corr)
##          Variable1   Variable2 Correlation Region
## 1      site_number site_number  1.00000000  North
## 2             p450 site_number  0.08244169  North
## 3              SOD site_number -0.18383185  North
## 4 condition_factor site_number -0.32105058  North
## 5    avg_thickness site_number  0.19111321  North
## 6      site_number        p450  0.08244169  North
#write.csv(all_corr, "/Users/cmantegna/Documents/WDFWmussels/output/tables/region_corr.csv")

List of sites included in each region (North, Central, South) based on latitude

library(dplyr)

# Split data into North, Central, and South regions based on latitude
data$region <- case_when(
  data$latitude >= 47.78184 & data$latitude <= 48.8208 ~ "North",
  data$latitude >= 47.38566 & data$latitude <= 47.7296079 ~ "Central",
  data$latitude >= 47.052362 & data$latitude <= 47.3539722 ~ "South",
  TRUE ~ NA_character_
)

# Filter out rows without a region designation (if any)
data <- data %>% filter(!is.na(region))

# Create a data frame listing sites by region
sites_by_region <- data %>% 
  select(region, site_name) %>% 
  distinct() %>%
  arrange(region, site_name)  # Optional: sort by region and site_name

# Print the data frame
head(sites_by_region)
##    region                           site_name
## 1 Central                        Arroyo Beach
## 2 Central                      Brackenwood Ln
## 3 Central                   Des Moines Marina
## 4 Central                     Eagle Harbor Dr
## 5 Central           Elliot Bay Myrtle Edwards
## 6 Central Elliott Bay, Harbor Island, Pier 17
#write.csv(sites_by_region,"/Users/cmantegna/Documents/WDFWmussels/data/regions.csv" )
library(dplyr)

# Convert all relevant variables to numeric and filter out any NAs
data <- data %>%
  mutate(
    p450 = as.numeric(p450),
    SOD = as.numeric(SOD),
    condition_factor = as.numeric(condition_factor),
    avg_thickness = as.numeric(avg_thickness),
    site_number = as.numeric(site_number),
    region = case_when(
      latitude >= 47.78184 & latitude <= 48.8208 ~ "North",
      latitude >= 47.38566 & latitude <= 47.7296079 ~ "Central",
      latitude >= 47.052362 & latitude <= 47.3539722 ~ "South",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(region)) %>%
  drop_na(p450, SOD, condition_factor, avg_thickness, site_number) # Drops rows with NA in any of these columns

# Define variables for correlation
variables <- c("site_number", "p450", "SOD", "condition_factor", "avg_thickness")

# Calculate Spearman's rank correlation matrix and p-values
get_correlation_and_pvalues <- function(df, region) {
  # Calculate correlation matrix
  corr_matrix <- cor(df[variables], method = "spearman")
  
  # Prepare to store p-values
  p_values <- matrix(nrow = length(variables), ncol = length(variables), dimnames = list(variables, variables))
  
  # Calculate p-values
  for (i in seq_along(variables)) {
    for (j in seq_along(variables)) {
      if (i != j) {
        test_result <- cor.test(df[[variables[i]]], df[[variables[j]]], method = "spearman")
        p_values[i, j] <- test_result$p.value
      } else {
        p_values[i, j] <- NA # Diagonal elements are not applicable
      }
    }
  }
  
  list(correlation = corr_matrix, p_values = p_values, region = region)
}

# Apply the function to each region
north_results <- get_correlation_and_pvalues(data %>% filter(region == "North"), "North")
central_results <- get_correlation_and_pvalues(data %>% filter(region == "Central"), "Central")
south_results <- get_correlation_and_pvalues(data %>% filter(region == "South"), "South")

head(north_results)
## $correlation
##                  site_number        p450         SOD condition_factor
## site_number       1.00000000  0.08244169 -0.18383185      -0.32105058
## p450              0.08244169  1.00000000 -0.26279653       0.07646417
## SOD              -0.18383185 -0.26279653  1.00000000      -0.04809044
## condition_factor -0.32105058  0.07646417 -0.04809044       1.00000000
## avg_thickness     0.19111321  0.36623867 -0.40940032       0.17280803
##                  avg_thickness
## site_number          0.1911132
## p450                 0.3662387
## SOD                 -0.4094003
## condition_factor     0.1728080
## avg_thickness        1.0000000
## 
## $p_values
##                   site_number         p450          SOD condition_factor
## site_number                NA 3.853384e-01 5.128632e-02     0.0005260579
## p450             0.3853384288           NA 4.921766e-03     0.4208386358
## SOD              0.0512863219 4.921766e-03           NA     0.6129849311
## condition_factor 0.0005260579 4.208386e-01 6.129849e-01               NA
## avg_thickness    0.0425905506 6.623795e-05 6.713738e-06     0.0671982559
##                  avg_thickness
## site_number       4.259055e-02
## p450              6.623795e-05
## SOD               6.713738e-06
## condition_factor  6.719826e-02
## avg_thickness               NA
## 
## $region
## [1] "North"
#write.csv(north_results,"/Users/cmantegna/Documents/WDFWmussels/output/tables/north_spearman.csv" )

head(central_results)
## $correlation
##                   site_number        p450         SOD condition_factor
## site_number       1.000000000 -0.07224937 -0.07368597      -0.02921879
## p450             -0.072249365  1.00000000 -0.21051421      -0.04389451
## SOD              -0.073685971 -0.21051421  1.00000000      -0.13044439
## condition_factor -0.029218787 -0.04389451 -0.13044439       1.00000000
## avg_thickness    -0.005281988  0.08280128  0.15217392       0.14770352
##                  avg_thickness
## site_number       -0.005281988
## p450               0.082801278
## SOD                0.152173917
## condition_factor   0.147703525
## avg_thickness      1.000000000
## 
## $p_values
##                  site_number       p450        SOD condition_factor
## site_number               NA 0.46390525 0.45504274        0.7673197
## p450               0.4639053         NA 0.03112088        0.6565979
## SOD                0.4550427 0.03112088         NA        0.1847299
## condition_factor   0.7673197 0.65659789 0.18472988               NA
## avg_thickness      0.9573521 0.40105002 0.12121525        0.1326706
##                  avg_thickness
## site_number          0.9573521
## p450                 0.4010500
## SOD                  0.1212153
## condition_factor     0.1326706
## avg_thickness               NA
## 
## $region
## [1] "Central"
#write.csv(central_results,"/Users/cmantegna/Documents/WDFWmussels/output/tables/central_spearman.csv" )

head(south_results)
## $correlation
##                  site_number        p450         SOD condition_factor
## site_number       1.00000000  0.13309113 -0.02303667       0.10318273
## p450              0.13309113  1.00000000 -0.23856887       0.06372156
## SOD              -0.02303667 -0.23856887  1.00000000      -0.08617974
## condition_factor  0.10318273  0.06372156 -0.08617974       1.00000000
## avg_thickness    -0.01938553  0.03066833  0.09951509       0.11243735
##                  avg_thickness
## site_number        -0.01938553
## p450                0.03066833
## SOD                 0.09951509
## condition_factor    0.11243735
## avg_thickness       1.00000000
## 
## $p_values
##                  site_number       p450        SOD condition_factor
## site_number               NA 0.20096793 0.82556768        0.3223437
## p450               0.2009679         NA 0.02057981        0.5417573
## SOD                0.8255677 0.02057981         NA        0.4088596
## condition_factor   0.3223437 0.54175726 0.40885963               NA
## avg_thickness      0.8528743 0.76919264 0.33993349        0.2806053
##                  avg_thickness
## site_number          0.8528743
## p450                 0.7691926
## SOD                  0.3399335
## condition_factor     0.2806053
## avg_thickness               NA
## 
## $region
## [1] "South"
#write.csv(south_results,"/Users/cmantegna/Documents/WDFWmussels/output/tables/south_spearman.csv" )

Boxplots of p450 and SOD by region

# p450
pregion<- ggplot(data, aes(x = region, y = p450)) + 
  geom_boxplot() + 
  labs(title = "p450 Levels by Region", x = "Region", y = "p450 Values")

print(pregion)

#ggsave(plot=pregion, filename="/Users/cmantegna/Documents/WDFWmussels/output/figures/p450region.png", width=15, height=8)

# SOD
sregion<- ggplot(data, aes(x = region, y = SOD)) + 
  geom_boxplot() + 
  labs(title = "SOD Levels by Region", x = "Region", y = "SOD Values")

print(sregion)

#ggsave(plot=sregion, filename="/Users/cmantegna/Documents/WDFWmussels/output/figures/SODregion.png", width=15, height=8)