knitr::opts_chunk$set(
echo = TRUE, # Display code chunks
eval = TRUE, # Evaluate code chunks
warning = FALSE, # Hide warnings
message = FALSE, # Hide messages
fig.width = 20, # Set plot width in inches
fig.height = 9, # Set plot height in inches
fig.align = "center" # Align plots to the center
)
library(tinytex)
library(tidyr)
library(tidyverse)
library(vegan)
Weight = mg
Length, width, height = mm
p450, SOD = activity/ (mg/protein)
Condition factor, economic factor = unitless
getwd()
## [1] "/Users/cmantegna/Documents/WDFWmussels/code"
#data has all sites, coordinates, p450, sod, condition factor, economic factor data
data<- read.csv("/Users/cmantegna/Documents/WDFWmussels/data/biomarkerfull.csv")
#alldata has the site names, biomarkers, condition factor, average thickness and analyte data - each row is an individual sample
alldata<- read.csv("/Users/cmantegna/Documents/WDFWmussels/data/alldata.csv")
# Data contains 0's and must be adjusted in this order to preserve all usable data.
#sod
#replace any SOD values at or below 0 with half of the lower detection limit of .005 (.005*.5). Lower detection limit determined by assay protocol by the manufacturer, Cayman.
data$SOD[data$SOD <= 0] <- 0.0025
alldata$SOD[alldata$SOD <= 0] <- 0.0025
#p450
#remove any p450 values that are 0 - those are true 0's not non-detectable. I am replacing with na so I don't lose the entire row of data, including the SOD values.
data$p450[data$p450 <= 0] <- NA
alldata$p450[alldata$p450 <= 0] <- NA
#write.csv(alldata, "/Users/cmantegna/Documents/WDFWmussels/data/alldata.csv")
#Impute missing p450 and condition_factor values ### Using the ‘mice’ package, the most common imputation method is predictive mean matching (pmm) based on the package documentation, so that is used here.
# check the data frame
#summary(alldata)
#str(alldata)
library(dplyr)
# Prepare the data for clustering - only numeric columns should be included
clustering_data <- alldata %>%
select(p450, SOD, condition_factor, avg_thickness) %>%
na.omit() # Removing rows with NA values
# Scale the data
clustering_data <- scale(clustering_data)
# Perform K-means clustering with a chosen number of clusters, let's say 3 for this example
set.seed(123) # Setting seed for reproducibility
kmeans_result <- kmeans(clustering_data, centers = 3)
# Add the cluster assignments to the original data
alldata$cluster <- kmeans_result$cluster[match(rownames(clustering_data), rownames(alldata))]
# Now let's list the unique site names within each cluster
clustered_sites <- alldata %>%
select(site_name, cluster) %>%
group_by(cluster) %>%
summarise(unique_sites = list(unique(site_name))) # Using list to keep it all within one dataframe
print(clustered_sites)
## # A tibble: 3 × 2
## cluster unique_sites
## <int> <list>
## 1 1 <chr [11]>
## 2 2 <chr [64]>
## 3 3 <chr [60]>
# Expanding the list into separate rows
clustered_sites_expanded <- clustered_sites %>%
unnest(unique_sites)
#write to CSV
#write.csv(clustered_sites_expanded, "/Users/cmantegna/Documents/WDFWmussels/output/clustered_sites.csv", row.names = FALSE)
library(dplyr)
# Selecting and scaling the columns (standardize them because K-means is affected by scale)
data_selected <- alldata %>%
select(p450, SOD, condition_factor, avg_thickness) %>%
na.omit() %>% # Removing any NA values to avoid errors in kmeans
scale() # Scale the data to standardize
# Perform K-means clustering, let's say we determine that there are 3 clusters
set.seed(123) # For reproducibility
kmeans_result <- kmeans(data_selected, centers = 3)
# The centroids (cluster centers) are stored in the 'centers' attribute
centroids <- kmeans_result$centers
# Print the centroids
print(centroids)
## p450 SOD condition_factor avg_thickness
## 1 -0.3405493 3.67357345 -0.15906742 0.1353372
## 2 0.1929275 -0.23764645 0.06870675 0.6953590
## 3 -0.2136235 -0.05978512 -0.07232716 -0.9051885
library(dplyr)
library(pgirmess)
# Perform Kruskal-Wallis test for each variable
kw_p450 <- kruskal.test(p450 ~ cluster, data = alldata)
kw_SOD <- kruskal.test(SOD ~ cluster, data = alldata)
kw_condition_factor <- kruskal.test(condition_factor ~ cluster, data = alldata)
kw_avg_thickness <- kruskal.test(avg_thickness ~ cluster, data = alldata)
# Print the Kruskal-Wallis test results
print(kw_p450)
##
## Kruskal-Wallis rank sum test
##
## data: p450 by cluster
## Kruskal-Wallis chi-squared = 31.692, df = 2, p-value = 1.312e-07
print(kw_SOD)
##
## Kruskal-Wallis rank sum test
##
## data: SOD by cluster
## Kruskal-Wallis chi-squared = 41.443, df = 2, p-value = 1.002e-09
print(kw_condition_factor)
##
## Kruskal-Wallis rank sum test
##
## data: condition_factor by cluster
## Kruskal-Wallis chi-squared = 6.2766, df = 2, p-value = 0.04336
print(kw_avg_thickness)
##
## Kruskal-Wallis rank sum test
##
## data: avg_thickness by cluster
## Kruskal-Wallis chi-squared = 215.67, df = 2, p-value < 2.2e-16
#p450
if(kw_p450$p.value < 0.05) {
posthoc_p450 <- kruskalmc(p450 ~ cluster, data = alldata)
print(posthoc_p450)
}
## Multiple comparison test after Kruskal-Wallis
## alpha: 0.05
## Comparisons
## obs.dif critical.dif stat.signif
## 1-2 85.73947 62.17164 TRUE
## 1-3 32.39518 62.79907 FALSE
## 2-3 53.34428 25.17240 TRUE
#SOD
if(kw_SOD$p.value < 0.05) {
posthoc_SOD <- kruskalmc(SOD ~ cluster, data = alldata)
print(posthoc_SOD)
}
## Multiple comparison test after Kruskal-Wallis
## alpha: 0.05
## Comparisons
## obs.dif critical.dif stat.signif
## 1-2 165.19689 62.17164 TRUE
## 1-3 144.02231 62.79907 TRUE
## 2-3 21.17457 25.17240 FALSE
#condition_factor
if(kw_condition_factor$p.value < 0.05) {
posthoc_condition_factor <- kruskalmc(condition_factor ~ cluster, data = alldata)
print(posthoc_condition_factor)
}
## Multiple comparison test after Kruskal-Wallis
## alpha: 0.05
## Comparisons
## obs.dif critical.dif stat.signif
## 1-2 18.377289 62.17164 FALSE
## 1-3 7.812096 62.79907 FALSE
## 2-3 26.189386 25.17240 TRUE
#avg_thickness
if(kw_avg_thickness$p.value < 0.05) {
posthoc_avg_thickness <- kruskalmc(avg_thickness ~ cluster, data = alldata)
print(posthoc_avg_thickness)
}
## Multiple comparison test after Kruskal-Wallis
## alpha: 0.05
## Comparisons
## obs.dif critical.dif stat.signif
## 1-2 55.37042 62.17164 FALSE
## 1-3 98.94833 62.79907 TRUE
## 2-3 154.31875 25.17240 TRUE