Directory and doc rules

knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = TRUE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 20,       # Set plot width in inches
  fig.height = 9,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)

Load packages

library(tinytex)
library(tidyr)
library(tidyverse)
library(vegan)

Load data

Weight = mg
Length, width, height = mm
p450, SOD = activity/ (mg/protein)
Condition factor, economic factor = unitless

getwd()
## [1] "/Users/cmantegna/Documents/WDFWmussels/code"
#data has all sites, coordinates, p450, sod, condition factor, economic factor data
data<- read.csv("/Users/cmantegna/Documents/WDFWmussels/data/biomarkerfull.csv")

#alldata has the site names, biomarkers, condition factor, average thickness and analyte data - each row is an individual sample
alldata<- read.csv("/Users/cmantegna/Documents/WDFWmussels/data/alldata.csv")

fix zero’s in the data frame and alldata frame

# Data contains 0's and must be adjusted in this order to preserve all usable data.

#sod
#replace any SOD values at or below 0 with half of the lower detection limit of .005 (.005*.5). Lower detection limit determined by assay protocol by the manufacturer, Cayman.
data$SOD[data$SOD <= 0] <- 0.0025
alldata$SOD[alldata$SOD <= 0] <- 0.0025
#p450
#remove any p450 values that are 0 - those are true 0's not non-detectable. I am replacing with na so I don't lose the entire row of data, including the SOD values.
data$p450[data$p450 <= 0] <- NA
alldata$p450[alldata$p450 <= 0] <- NA

#write.csv(alldata, "/Users/cmantegna/Documents/WDFWmussels/data/alldata.csv")

#Impute missing p450 and condition_factor values ### Using the ‘mice’ package, the most common imputation method is predictive mean matching (pmm) based on the package documentation, so that is used here.

# check the data frame
#summary(alldata)
#str(alldata)

K-means clustering

This spatial analysis test uses the complete data set to statistically determine clusters of difference in the data. Standard cluster choice is 3 or 4 (I chose 3 and not 4; 4 clusters yield a cluster of 1 site) clusters that are distinct from the other clusters based on averaging the data to find centroid values. This is a tool to help identify patterns that may not be visible in basic analyses.

library(dplyr)

# Prepare the data for clustering - only numeric columns should be included
clustering_data <- alldata %>%
  select(p450, SOD, condition_factor, avg_thickness) %>%
  na.omit()  # Removing rows with NA values

# Scale the data
clustering_data <- scale(clustering_data)

# Perform K-means clustering with a chosen number of clusters, let's say 3 for this example
set.seed(123)  # Setting seed for reproducibility
kmeans_result <- kmeans(clustering_data, centers = 3)

# Add the cluster assignments to the original data
alldata$cluster <- kmeans_result$cluster[match(rownames(clustering_data), rownames(alldata))]

# Now let's list the unique site names within each cluster
clustered_sites <- alldata %>%
  select(site_name, cluster) %>%
  group_by(cluster) %>%
  summarise(unique_sites = list(unique(site_name)))  # Using list to keep it all within one dataframe

print(clustered_sites)
## # A tibble: 3 × 2
##   cluster unique_sites
##     <int> <list>      
## 1       1 <chr [11]>  
## 2       2 <chr [64]>  
## 3       3 <chr [60]>
# Expanding the list into separate rows
clustered_sites_expanded <- clustered_sites %>%
  unnest(unique_sites)

#write to CSV
#write.csv(clustered_sites_expanded, "/Users/cmantegna/Documents/WDFWmussels/output/clustered_sites.csv", row.names = FALSE)

Defining the centroids that define the clusters.

these centroids define each cluster (1-3) and assign a scaled z-score that produces negative and positive values. Negative values indicate a centroid value below the mean, and a positive number indicates a centroid value above the mean.

Cluster 1 has p450 and condition_factor values below the mean, and SOD and avg_thickness values above the mean.

Cluster 2 has SOD values below the mean, and p450, condition_factor and avg_thickness values above the mean.

Cluster 3 has all four values below their means.

library(dplyr)

# Selecting and scaling the columns (standardize them because K-means is affected by scale)
data_selected <- alldata %>% 
  select(p450, SOD, condition_factor, avg_thickness) %>% 
  na.omit() %>%  # Removing any NA values to avoid errors in kmeans
  scale()  # Scale the data to standardize

# Perform K-means clustering, let's say we determine that there are 3 clusters
set.seed(123)  # For reproducibility
kmeans_result <- kmeans(data_selected, centers = 3)

# The centroids (cluster centers) are stored in the 'centers' attribute
centroids <- kmeans_result$centers

# Print the centroids
print(centroids)
##         p450         SOD condition_factor avg_thickness
## 1 -0.3405493  3.67357345      -0.15906742     0.1353372
## 2  0.1929275 -0.23764645       0.06870675     0.6953590
## 3 -0.2136235 -0.05978512      -0.07232716    -0.9051885

Kruskal-Wallis & kruskalmc post hoc tests to verify if there are any statistically significant differences between the clusters

K-W shows a significant relationship between p450, SOD, condition_factor and avg_thickness and clustered sites. A post hoc test on each of these metrics will determine if the relationship is truly significant.

library(dplyr)
library(pgirmess)

# Perform Kruskal-Wallis test for each variable
kw_p450 <- kruskal.test(p450 ~ cluster, data = alldata)
kw_SOD <- kruskal.test(SOD ~ cluster, data = alldata)
kw_condition_factor <- kruskal.test(condition_factor ~ cluster, data = alldata)
kw_avg_thickness <- kruskal.test(avg_thickness ~ cluster, data = alldata)

# Print the Kruskal-Wallis test results
print(kw_p450)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  p450 by cluster
## Kruskal-Wallis chi-squared = 31.692, df = 2, p-value = 1.312e-07
print(kw_SOD)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  SOD by cluster
## Kruskal-Wallis chi-squared = 41.443, df = 2, p-value = 1.002e-09
print(kw_condition_factor)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  condition_factor by cluster
## Kruskal-Wallis chi-squared = 6.2766, df = 2, p-value = 0.04336
print(kw_avg_thickness)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  avg_thickness by cluster
## Kruskal-Wallis chi-squared = 215.67, df = 2, p-value < 2.2e-16

p450 kruskalmc post hoc

p450 values show a significant difference between clusters 1-2 and 2-3 but not between 1-3.

#p450

if(kw_p450$p.value < 0.05) {
  posthoc_p450 <- kruskalmc(p450 ~ cluster, data = alldata)
  print(posthoc_p450)
}
## Multiple comparison test after Kruskal-Wallis 
## alpha: 0.05 
## Comparisons
##      obs.dif critical.dif stat.signif
## 1-2 85.73947     62.17164        TRUE
## 1-3 32.39518     62.79907       FALSE
## 2-3 53.34428     25.17240        TRUE

SOD kruskalmc post hoc

SOD shows a significant difference between clusters 1-2 and 1-3 but not between 2-3.

#SOD

if(kw_SOD$p.value < 0.05) {
  posthoc_SOD <- kruskalmc(SOD ~ cluster, data = alldata)
  print(posthoc_SOD)
}
## Multiple comparison test after Kruskal-Wallis 
## alpha: 0.05 
## Comparisons
##       obs.dif critical.dif stat.signif
## 1-2 165.19689     62.17164        TRUE
## 1-3 144.02231     62.79907        TRUE
## 2-3  21.17457     25.17240       FALSE

condition_factor kruskalmc post hoc

Condition_factor shows no statistically significant difference between the clusters

#condition_factor

if(kw_condition_factor$p.value < 0.05) {
  posthoc_condition_factor <- kruskalmc(condition_factor ~ cluster, data = alldata)
  print(posthoc_condition_factor)
}
## Multiple comparison test after Kruskal-Wallis 
## alpha: 0.05 
## Comparisons
##       obs.dif critical.dif stat.signif
## 1-2 18.377289     62.17164       FALSE
## 1-3  7.812096     62.79907       FALSE
## 2-3 26.189386     25.17240        TRUE

avg_thickness kruskalmc post hoc

Avg_thickness shows a significant difference between clusters 1-3 and 2-3 but not between 1-2.

#avg_thickness

if(kw_avg_thickness$p.value < 0.05) {
  posthoc_avg_thickness <- kruskalmc(avg_thickness ~ cluster, data = alldata)
  print(posthoc_avg_thickness)
}
## Multiple comparison test after Kruskal-Wallis 
## alpha: 0.05 
## Comparisons
##       obs.dif critical.dif stat.signif
## 1-2  55.37042     62.17164       FALSE
## 1-3  98.94833     62.79907        TRUE
## 2-3 154.31875     25.17240        TRUE