Directory and doc rules

knitr::opts_chunk$set(
  echo = TRUE,         # Display code chunks
  eval = TRUE,         # Evaluate code chunks
  warning = FALSE,     # Hide warnings
  message = FALSE,     # Hide messages
  fig.width = 8,       # Set plot width in inches
  fig.height = 5,      # Set plot height in inches
  fig.align = "center" # Align plots to the center
)

Load packages

library(tidyr)
library(tidyverse)
library(ggplot2)
library(vegan)

Load data

getwd()

## [1] "/Users/cmantegna/Documents/WDFWmussels/code"

#data has all sites, coordinates, p450, sod, condition factor, economic factor data
mdata<- read.csv("/Users/cmantegna/Documents/WDFWmussels/data/biomarkerfull.csv")

Review setup

str(mdata)

## 'data.frame':    312 obs. of  16 variables:
##  $ latitude        : num  48.7 48.7 48.7 48.7 47.5 ...
##  $ longitude       : num  -123 -123 -123 -123 -122 ...
##  $ site_name       : chr  "Aiston Preserve" "Aiston Preserve" "Aiston Preserve" "Aiston Preserve" ...
##  $ site_number     : int  77 77 77 77 13 13 13 13 13 13 ...
##  $ sample          : int  239 240 241 242 281 282 283 284 285 286 ...
##  $ p450            : int  5965780 1508156 4674882 2861653 3448794 6485447 3563340 1813227 1987132 9587219 ...
##  $ SOD             : num  0 4.88 8.87 0.01 7.08 ...
##  $ weight_initial  : chr  "11.6884" "10.833" "14.7041" "14.6121" ...
##  $ length          : chr  "53.9" "53.49" "55.99" "58.55" ...
##  $ width           : chr  "22.73" "23.92" "27.79" "28.38" ...
##  $ height          : chr  "18.59" "18.36" "19.57" "19.55" ...
##  $ weight_final    : num  3.28 3.48 4.73 4.45 4.62 ...
##  $ weight_change   : chr  "8.41" "7.35" "9.98" "10.17" ...
##  $ condition_factor: chr  "0.1560" "0.1374" "0.1782" "0.1737" ...
##  $ avg_thickness   : num  0.7 0.79 0.825 0.93 0.92 0.965 0.86 0.955 0.875 0.645 ...
##  $ economic_index  : chr  "0.0018" "0.002" "0.002" "0.0021" ...

Data type and summary

Note

Weight = g
Length, width, height = mm
p450, SOD = activity/ (mg/protein)
Condition factor, economic factor = unitless

# the data types are based in str() results, make numeric anything with na in the column.
summary(mdata)

##     latitude       longitude       site_name          site_number   
##  Min.   :47.05   Min.   :-123.5   Length:312         Min.   : 1.00  
##  1st Qu.:47.33   1st Qu.:-122.7   Class :character   1st Qu.:21.00  
##  Median :47.61   Median :-122.6   Mode  :character   Median :40.00  
##  Mean   :47.71   Mean   :-122.6                      Mean   :39.66  
##  3rd Qu.:48.02   3rd Qu.:-122.4                      3rd Qu.:59.00  
##  Max.   :48.82   Max.   :-122.2                      Max.   :77.00  
##      sample            p450               SOD          weight_initial    
##  Min.   :  1.00   Min.   :       0   Min.   : -0.636   Length:312        
##  1st Qu.: 78.75   1st Qu.: 2309270   1st Qu.:  1.201   Class :character  
##  Median :156.50   Median : 3746310   Median :  5.730   Mode  :character  
##  Mean   :156.50   Mean   : 4430731   Mean   : 10.345                     
##  3rd Qu.:234.25   3rd Qu.: 5729620   3rd Qu.: 13.752                     
##  Max.   :312.00   Max.   :52717198   Max.   :133.268                     
##     length             width              height           weight_final   
##  Length:312         Length:312         Length:312         Min.   : 2.484  
##  Class :character   Class :character   Class :character   1st Qu.: 3.801  
##  Mode  :character   Mode  :character   Mode  :character   Median : 4.417  
##                                                           Mean   : 4.656  
##                                                           3rd Qu.: 5.003  
##                                                           Max.   :20.625  
##  weight_change      condition_factor   avg_thickness    economic_index    
##  Length:312         Length:312         Min.   :0.3550   Length:312        
##  Class :character   Class :character   1st Qu.:0.6800   Class :character  
##  Mode  :character   Mode  :character   Median :0.7975   Mode  :character  
##                                        Mean   :0.7835                     
##                                        3rd Qu.:0.8762                     
##                                        Max.   :1.2600

as.numeric("weight_initial", "weight_change", "length", "width", "height", "condition_factor", "economic_index")

## [1] NA

0’s and negative number management

SOD values that are negative are changed to 0 signifying that the value was undetectable.
p450 values that are 0 are missing, not actual 0’s.

#get rid of the p450 values that are 0
mdata <- mdata[mdata$p450 != 0, ]

#change any negative numbers to 0 to indicate not detectable
mdata$SOD <- ifelse(mdata$SOD <= 0, 0, mdata$SOD)

Test p450 for data normality

Interpretation- p450 data is not normally distributed

shapiro.test(mdata$p450)

## 
##  Shapiro-Wilk normality test
## 
## data:  mdata$p450
## W = 0.55896, p-value < 2.2e-16

Test SOD for data normality

Interpretation-SOD data is not normally distributed

shapiro.test(mdata$SOD)

## 
##  Shapiro-Wilk normality test
## 
## data:  mdata$SOD
## W = 0.64369, p-value < 2.2e-16

Correlation, Pearson Correlation

Interpretation - slight negative correlation between the biomarkers is not statistically significant

#individual correlation test between the biomarkers
cor.test(mdata$p450, mdata$SOD)

## 
##  Pearson's product-moment correlation
## 
## data:  mdata$p450 and mdata$SOD
## t = -1.3001, df = 302, p-value = 0.1946
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.18553865  0.03820952
## sample estimates:
##         cor 
## -0.07460345

the interpretation here is the same as above, just created the table for clear view of biomarkers with the condition_factor

#pearson correlation

# cor.test(df$site_number, df$sod, method = "pearson")
# Convert condition_factor to numeric if it's not already
mdata$condition_factor <- as.numeric(mdata$condition_factor)

# Perform Pearson correlation with NA values ignored
cor(mdata[c("p450", "SOD", "condition_factor")], use = "complete.obs", method = "pearson")

##                         p450         SOD condition_factor
## p450              1.00000000 -0.05276461       0.05468865
## SOD              -0.05276461  1.00000000      -0.02682979
## condition_factor  0.05468865 -0.02682979       1.00000000

Plot Pearson

library(corrplot)

cor.mtest <- function(mat, conf.level = 0.95){
  mat <- as.matrix(mat)
  n <- ncol(mat)
  p.mat <- matrix(NA, n, n)
  diag(p.mat) <- 0
  
  for(i in 1:(n-1)){
    for(j in (i+1):n){
      tmp <- cor.test(mat[,i], mat[,j], conf.level = conf.level)
      p.mat[i,j] <- tmp$p.value
      p.mat[j,i] <- tmp$p.value
    }
  }
  list(p = p.mat, conf.level = conf.level)
}

# Assuming 'mdata' and 'data' are defined and 'data' is used to subset 'mdata'
df2 <- mdata[sapply(mdata, is.numeric)] # Ensure 'mdata' is used here
df2 <- na.omit(df2)
M <- cor(df2)
testRes <- cor.mtest(df2, conf.level = 0.95)

# Visualization with corrplot
corrplot::corrplot(M,
  method = "circle", type = "lower", insig = "blank",
  addCoef.col = "black", number.cex = 0.8, order = "AOE", diag = FALSE
)

#print(cor)
#ggsave(plot=cor, filename="/Users/cmantegna/Documents/WDFWmussels/output/pearson.png", width=15, height=8)

PCA Plot with biomarkers, condition_factor and economic_factor

#install.packages("FactoMineR")
#install.packages("factoextra")
library('FactoMineR')
library("factoextra")

# Ensure that condition_factor and economic_index are numeric
mdata$condition_factor <- as.numeric(mdata$condition_factor)
mdata$economic_index <- as.numeric(mdata$economic_index)

# Remove NAs from the dataset
df_clean <- na.omit(mdata)

# Selecting the relevant variables for PCA
pca_data <- df_clean[, c("SOD", "p450", "condition_factor", "economic_index")]

# Performing PCA
pca_res <- PCA(pca_data, scale.unit = TRUE, graph = FALSE)

# Plotting the PCA
pcaplot<- fviz_pca_biplot(pca_res, label = "var", col.var = "contrib",
                gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
                repel = TRUE)  # Avoid text overlapping (slow if many points)



print(pcaplot)

#ggsave(plot=pcaplot, filename="/Users/cmantegna/Documents/WDFWmussels/output/pca.png", width=15, height=8)

K-means cluster/ relationship test

Interpretation- there are three clusters that vary in size and that is an indicator of a significant number of outliers, see the 01-explore code for the plot of the outliers. There are significant outliers for SOD, this may be rendering this test unhelpful. “The means of each cluster give you an idea of the centroid values around which the data points in each cluster are aggregated.”

#fixing data for kmeans
sum(is.na(mdata$p450)) # Checks for NA values in p450

## [1] 0

sum(is.nan(mdata$p450)) # Checks for NaN values in p450

## [1] 0

sum(is.infinite(mdata$p450)) # Checks for Inf values in p450

## [1] 0

sum(is.na(mdata$SOD))

## [1] 0

sum(is.nan(mdata$SOD))

## [1] 0

sum(is.infinite(mdata$SOD))

## [1] 0

sum(is.na(mdata$condition_factor))

## [1] 15

sum(is.nan(mdata$condition_factor))

## [1] 0

sum(is.infinite(mdata$condition_factor))

## [1] 0

mdata$p450 <- as.numeric(mdata$p450)
mdata$SOD <- as.numeric(mdata$SOD)
mdata$condition_factor <- as.numeric(mdata$condition_factor)

# Remove rows with NA or NaN values in specified columns
clean_data <- mdata[complete.cases(mdata[, c("p450", "SOD", "condition_factor")]), ]

# Ensure all data is numeric for the Inf check to make sense
clean_data <- transform(clean_data,
                        p450 = as.numeric(p450),
                        SOD = as.numeric(SOD),
                        condition_factor = as.numeric(condition_factor))

# Now check for and handle Inf values
clean_data <- clean_data[!is.infinite(clean_data$p450) & !is.infinite(clean_data$SOD) & !is.infinite(clean_data$condition_factor), ]

kmeans_result <- kmeans(clean_data[, c("p450", "SOD", "condition_factor")], centers = 3)

print(kmeans_result)

## K-means clustering with 3 clusters of sizes 2, 92, 195
## 
## Cluster means:
##       p450       SOD condition_factor
## 1 42317433 17.459500        0.1755000
## 2  7282065  5.758826        0.2374457
## 3  3019304 10.576467        0.1934713
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   2   3   3   3   3   2   3   3   3   2   3   2   2   3   3   3   3   3   3   3 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   3   3   3   3   3   3   3   2   2   2   3   3   3   2   3   3   3   3   3   3 
##  41  42  43  44  45  46  47  48  49  50  51  52  54  56  57  58  59  60  61  62 
##   3   3   3   3   3   3   3   3   3   3   2   2   3   3   3   3   3   2   3   2 
##  63  64  65  66  67  68  69  70  71  72  73  74  75  80  81  82  83  84  85  86 
##   2   2   2   3   3   2   2   3   3   3   3   3   3   3   3   2   3   2   3   2 
##  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 
##   3   3   2   1   3   3   3   2   2   3   3   3   3   3   2   2   2   2   2   2 
## 107 108 113 114 115 116 117 118 119 120 121 122 123 124 125 128 129 131 132 133 
##   2   3   2   2   3   2   3   3   3   2   3   2   3   2   3   3   3   3   3   2 
## 134 135 136 137 138 139 140 141 146 147 148 149 150 151 152 153 154 155 156 157 
##   3   3   3   3   3   2   3   2   2   2   3   2   2   2   3   2   3   3   3   3 
## 158 159 160 161 162 163 164 165 166 167 168 170 171 172 173 174 175 180 181 182 
##   2   2   3   2   2   3   3   3   3   3   3   3   3   3   2   3   3   2   3   2 
## 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 
##   2   3   2   2   2   2   2   3   3   3   3   3   3   2   1   3   3   3   3   3 
## 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 
##   3   3   3   3   3   2   3   3   3   3   3   3   3   2   2   3   3   2   3   2 
## 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 
##   2   3   3   3   3   3   2   2   2   3   3   2   2   3   2   3   3   3   3   3 
## 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 
##   3   2   3   3   3   3   3   2   3   3   3   3   3   3   3   3   3   3   3   3 
## 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 
##   3   3   3   3   3   3   3   2   3   3   3   3   2   3   3   2   2   2   3   2 
## 283 284 285 286 287 288 290 291 292 293 294 295 296 297 298 299 300 301 302 303 
##   3   2   3   3   3   3   2   2   2   3   2   2   3   2   2   3   2   3   3   3 
## 304 305 306 307 308 309 310 311 312 
##   3   2   3   2   3   2   3   3   3 
## 
## Within cluster sum of squares by cluster:
## [1] 2.163102e+14 3.444550e+14 2.716409e+14
##  (between_SS / total_SS =  82.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

02- Basic Statistics