knitr::opts_chunk$set(
echo = TRUE, # Display code chunks
eval = TRUE, # Evaluate code chunks
warning = FALSE, # Hide warnings
message = FALSE, # Hide messages
fig.width = 8, # Set plot width in inches
fig.height = 5, # Set plot height in inches
fig.align = "center" # Align plots to the center
)
library(tidyr)
library(tidyverse)
library(ggplot2)
library(vegan)
getwd()
## [1] "/Users/cmantegna/Documents/WDFWmussels/code"
#data has all sites, coordinates, p450, sod, condition factor, economic factor data
mdata<- read.csv("/Users/cmantegna/Documents/WDFWmussels/data/biomarkerfull.csv")
str(mdata)
## 'data.frame': 312 obs. of 16 variables:
## $ latitude : num 48.7 48.7 48.7 48.7 47.5 ...
## $ longitude : num -123 -123 -123 -123 -122 ...
## $ site_name : chr "Aiston Preserve" "Aiston Preserve" "Aiston Preserve" "Aiston Preserve" ...
## $ site_number : int 77 77 77 77 13 13 13 13 13 13 ...
## $ sample : int 239 240 241 242 281 282 283 284 285 286 ...
## $ p450 : int 5965780 1508156 4674882 2861653 3448794 6485447 3563340 1813227 1987132 9587219 ...
## $ SOD : num 0 4.88 8.87 0.01 7.08 ...
## $ weight_initial : chr "11.6884" "10.833" "14.7041" "14.6121" ...
## $ length : chr "53.9" "53.49" "55.99" "58.55" ...
## $ width : chr "22.73" "23.92" "27.79" "28.38" ...
## $ height : chr "18.59" "18.36" "19.57" "19.55" ...
## $ weight_final : num 3.28 3.48 4.73 4.45 4.62 ...
## $ weight_change : chr "8.41" "7.35" "9.98" "10.17" ...
## $ condition_factor: chr "0.1560" "0.1374" "0.1782" "0.1737" ...
## $ avg_thickness : num 0.7 0.79 0.825 0.93 0.92 0.965 0.86 0.955 0.875 0.645 ...
## $ economic_index : chr "0.0018" "0.002" "0.002" "0.0021" ...
Weight = g
Length, width, height = mm
p450, SOD = activity/ (mg/protein)
Condition factor, economic factor = unitless
# the data types are based in str() results, make numeric anything with na in the column.
summary(mdata)
## latitude longitude site_name site_number
## Min. :47.05 Min. :-123.5 Length:312 Min. : 1.00
## 1st Qu.:47.33 1st Qu.:-122.7 Class :character 1st Qu.:21.00
## Median :47.61 Median :-122.6 Mode :character Median :40.00
## Mean :47.71 Mean :-122.6 Mean :39.66
## 3rd Qu.:48.02 3rd Qu.:-122.4 3rd Qu.:59.00
## Max. :48.82 Max. :-122.2 Max. :77.00
## sample p450 SOD weight_initial
## Min. : 1.00 Min. : 0 Min. : -0.636 Length:312
## 1st Qu.: 78.75 1st Qu.: 2309270 1st Qu.: 1.201 Class :character
## Median :156.50 Median : 3746310 Median : 5.730 Mode :character
## Mean :156.50 Mean : 4430731 Mean : 10.345
## 3rd Qu.:234.25 3rd Qu.: 5729620 3rd Qu.: 13.752
## Max. :312.00 Max. :52717198 Max. :133.268
## length width height weight_final
## Length:312 Length:312 Length:312 Min. : 2.484
## Class :character Class :character Class :character 1st Qu.: 3.801
## Mode :character Mode :character Mode :character Median : 4.417
## Mean : 4.656
## 3rd Qu.: 5.003
## Max. :20.625
## weight_change condition_factor avg_thickness economic_index
## Length:312 Length:312 Min. :0.3550 Length:312
## Class :character Class :character 1st Qu.:0.6800 Class :character
## Mode :character Mode :character Median :0.7975 Mode :character
## Mean :0.7835
## 3rd Qu.:0.8762
## Max. :1.2600
as.numeric("weight_initial", "weight_change", "length", "width", "height", "condition_factor", "economic_index")
## [1] NA
SOD values that are negative are changed to 0 signifying that the
value was undetectable.
p450 values that are 0 are missing, not actual 0’s.
#get rid of the p450 values that are 0
mdata <- mdata[mdata$p450 != 0, ]
#change any negative numbers to 0 to indicate not detectable
mdata$SOD <- ifelse(mdata$SOD <= 0, 0, mdata$SOD)
shapiro.test(mdata$p450)
##
## Shapiro-Wilk normality test
##
## data: mdata$p450
## W = 0.55896, p-value < 2.2e-16
shapiro.test(mdata$SOD)
##
## Shapiro-Wilk normality test
##
## data: mdata$SOD
## W = 0.64369, p-value < 2.2e-16
#individual correlation test between the biomarkers
cor.test(mdata$p450, mdata$SOD)
##
## Pearson's product-moment correlation
##
## data: mdata$p450 and mdata$SOD
## t = -1.3001, df = 302, p-value = 0.1946
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.18553865 0.03820952
## sample estimates:
## cor
## -0.07460345
#pearson correlation
# cor.test(df$site_number, df$sod, method = "pearson")
# Convert condition_factor to numeric if it's not already
mdata$condition_factor <- as.numeric(mdata$condition_factor)
# Perform Pearson correlation with NA values ignored
cor(mdata[c("p450", "SOD", "condition_factor")], use = "complete.obs", method = "pearson")
## p450 SOD condition_factor
## p450 1.00000000 -0.05276461 0.05468865
## SOD -0.05276461 1.00000000 -0.02682979
## condition_factor 0.05468865 -0.02682979 1.00000000
library(corrplot)
cor.mtest <- function(mat, conf.level = 0.95){
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat <- matrix(NA, n, n)
diag(p.mat) <- 0
for(i in 1:(n-1)){
for(j in (i+1):n){
tmp <- cor.test(mat[,i], mat[,j], conf.level = conf.level)
p.mat[i,j] <- tmp$p.value
p.mat[j,i] <- tmp$p.value
}
}
list(p = p.mat, conf.level = conf.level)
}
# Assuming 'mdata' and 'data' are defined and 'data' is used to subset 'mdata'
df2 <- mdata[sapply(mdata, is.numeric)] # Ensure 'mdata' is used here
df2 <- na.omit(df2)
M <- cor(df2)
testRes <- cor.mtest(df2, conf.level = 0.95)
# Visualization with corrplot
corrplot::corrplot(M,
method = "circle", type = "lower", insig = "blank",
addCoef.col = "black", number.cex = 0.8, order = "AOE", diag = FALSE
)
#print(cor)
#ggsave(plot=cor, filename="/Users/cmantegna/Documents/WDFWmussels/output/pearson.png", width=15, height=8)
#install.packages("FactoMineR")
#install.packages("factoextra")
library('FactoMineR')
library("factoextra")
# Ensure that condition_factor and economic_index are numeric
mdata$condition_factor <- as.numeric(mdata$condition_factor)
mdata$economic_index <- as.numeric(mdata$economic_index)
# Remove NAs from the dataset
df_clean <- na.omit(mdata)
# Selecting the relevant variables for PCA
pca_data <- df_clean[, c("SOD", "p450", "condition_factor", "economic_index")]
# Performing PCA
pca_res <- PCA(pca_data, scale.unit = TRUE, graph = FALSE)
# Plotting the PCA
pcaplot<- fviz_pca_biplot(pca_res, label = "var", col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE) # Avoid text overlapping (slow if many points)
print(pcaplot)
#ggsave(plot=pcaplot, filename="/Users/cmantegna/Documents/WDFWmussels/output/pca.png", width=15, height=8)
#fixing data for kmeans
sum(is.na(mdata$p450)) # Checks for NA values in p450
## [1] 0
sum(is.nan(mdata$p450)) # Checks for NaN values in p450
## [1] 0
sum(is.infinite(mdata$p450)) # Checks for Inf values in p450
## [1] 0
sum(is.na(mdata$SOD))
## [1] 0
sum(is.nan(mdata$SOD))
## [1] 0
sum(is.infinite(mdata$SOD))
## [1] 0
sum(is.na(mdata$condition_factor))
## [1] 15
sum(is.nan(mdata$condition_factor))
## [1] 0
sum(is.infinite(mdata$condition_factor))
## [1] 0
mdata$p450 <- as.numeric(mdata$p450)
mdata$SOD <- as.numeric(mdata$SOD)
mdata$condition_factor <- as.numeric(mdata$condition_factor)
# Remove rows with NA or NaN values in specified columns
clean_data <- mdata[complete.cases(mdata[, c("p450", "SOD", "condition_factor")]), ]
# Ensure all data is numeric for the Inf check to make sense
clean_data <- transform(clean_data,
p450 = as.numeric(p450),
SOD = as.numeric(SOD),
condition_factor = as.numeric(condition_factor))
# Now check for and handle Inf values
clean_data <- clean_data[!is.infinite(clean_data$p450) & !is.infinite(clean_data$SOD) & !is.infinite(clean_data$condition_factor), ]
kmeans_result <- kmeans(clean_data[, c("p450", "SOD", "condition_factor")], centers = 3)
print(kmeans_result)
## K-means clustering with 3 clusters of sizes 2, 92, 195
##
## Cluster means:
## p450 SOD condition_factor
## 1 42317433 17.459500 0.1755000
## 2 7282065 5.758826 0.2374457
## 3 3019304 10.576467 0.1934713
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 3 3 3 3 2 3 3 3 2 3 2 2 3 3 3 3 3 3 3
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 3 3 3 3 3 3 3 2 2 2 3 3 3 2 3 3 3 3 3 3
## 41 42 43 44 45 46 47 48 49 50 51 52 54 56 57 58 59 60 61 62
## 3 3 3 3 3 3 3 3 3 3 2 2 3 3 3 3 3 2 3 2
## 63 64 65 66 67 68 69 70 71 72 73 74 75 80 81 82 83 84 85 86
## 2 2 2 3 3 2 2 3 3 3 3 3 3 3 3 2 3 2 3 2
## 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
## 3 3 2 1 3 3 3 2 2 3 3 3 3 3 2 2 2 2 2 2
## 107 108 113 114 115 116 117 118 119 120 121 122 123 124 125 128 129 131 132 133
## 2 3 2 2 3 2 3 3 3 2 3 2 3 2 3 3 3 3 3 2
## 134 135 136 137 138 139 140 141 146 147 148 149 150 151 152 153 154 155 156 157
## 3 3 3 3 3 2 3 2 2 2 3 2 2 2 3 2 3 3 3 3
## 158 159 160 161 162 163 164 165 166 167 168 170 171 172 173 174 175 180 181 182
## 2 2 3 2 2 3 3 3 3 3 3 3 3 3 2 3 3 2 3 2
## 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
## 2 3 2 2 2 2 2 3 3 3 3 3 3 2 1 3 3 3 3 3
## 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
## 3 3 3 3 3 2 3 3 3 3 3 3 3 2 2 3 3 2 3 2
## 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
## 2 3 3 3 3 3 2 2 2 3 3 2 2 3 2 3 3 3 3 3
## 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262
## 3 2 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3
## 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
## 3 3 3 3 3 3 3 2 3 3 3 3 2 3 3 2 2 2 3 2
## 283 284 285 286 287 288 290 291 292 293 294 295 296 297 298 299 300 301 302 303
## 3 2 3 3 3 3 2 2 2 3 2 2 3 2 2 3 2 3 3 3
## 304 305 306 307 308 309 310 311 312
## 3 2 3 2 3 2 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 2.163102e+14 3.444550e+14 2.716409e+14
## (between_SS / total_SS = 82.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"