model_optimization.knit

## vector with package names x <- c("pbapply", "ggbeeswarm", "ggplot2", "viridis","cluster", "dplyr", "magrittr", "reshape", "plyr", "ade4", "svMisc", "geometry", "ape") aa <- lapply(x, function(y) { # check if installed, if not then install if (!y %in% installed.packages()[,"Package"]) install.packages(y) # load package try(require(y, character.only = T), silent = T) })

para <- parallel::detectCores() - 2 theme_set(theme_classic(base_size = 20)) knitr::opts_chunk$set(dpi = 100, fig.width = 12, warning = FALSE, root.dir = "..") knitr::opts_knit$set(root.dir = "..")

source("./scripts/assemblage_random_subsets.R")

# species_abundances <- readRDS("./data/species_abundances.rds") abundance_and_func_traits <- readRDS("~/Dropbox/Projects/eastern_tropical_pacific_herbivory/data_intermediate/fish/regional/herbfishes_regional_ecoregion_traits_biomass.rds") # transect_areas1 <- readRDS("./data/transect_areas.RDS") transect_areas <- abundance_and_func_traits[!duplicated(paste(abundance_and_func_traits$ID_transect,abundance_and_func_traits$site_id)), c("ID_transect", "site_id", "area_uvc", "region", "date")] transect_areas$ID_transect_num <- 1:nrow(transect_areas) # Max number of sites a transect is found in (should be 1!) max(table(transect_areas$ID_transect, transect_areas$site_id))

## [1] 1

predictors <- readRDS("~/Dropbox/Projects/eastern_tropical_pacific_herbivory/data_intermediate/predictors.RDS") # these ones don't have coordinates (should be 4 sites from mexico) setdiff(transect_areas$site_id, predictors$site_id)

## character(0)

transect_areas <- merge(transect_areas, predictors[!duplicated(predictors$site_id), ], by = "site_id") transect_areas$years_protected[is.na(transect_areas$years_protected)] <- 0 transect_areas$ID_transect_num <- 1:nrow(transect_areas)

subtransect_areas <- transect_areas[transect_areas$Ecoregion == "Revillagigedos" & !is.na(transect_areas$Ecoregion), ] set.seed(1001) FD_test <- assemblage_random_subsets(transc_metadata = subtransect_areas, sp_abund = abundance_and_func_traits, reps = 100, buffer = 0.1, parallel = 1, target.area = 600, overrep.cutoff = 0.9, site.specific.biomass = TRUE) head(FD_test[FD_test$taxa_n > 3, ]) # # sapply(FD_test, ncol) FD_results <- assemblage_random_subsets(X = transect_areas, reps = 1000, buffer = 0.1, parallel = para, target.area = 500) # # saveRDS(FD_results, "preliminary_results_random_sample_of_transects.RDS") # FDres <- readRDS("preliminary_results_random_sample_of_transects.RDS") # nrow(FDres) # anyDuplicated(FDres$ID_transect_num) / nrow(FDres) # # FDres <- FDres[!duplicated(FDres$ID_transect_num), ] # # FDres <- FDres[!is.na(FDres$Nb_sp), ] # # nrow(FDres) # # agg_sites <- aggregate(ID_transect_num ~ sites + locality, data = transect_areas, # FUN = function(x) length(unique(x))) # nrow(agg_sites) # # agg_random_samples <- aggregate(ID_transect_num ~ taxa_n + sites + locality, data = FDres, # FUN = function(x) length(unique(x))) # # # length(unique(agg_sites$locality)) # length(unique(agg_random_samples$locality)) # # nrow(agg_random_samples) # # # agg <- aggregate(area_uvc ~ sites + locality, data = transect_areas, # FUN = sum) # # # hist(agg$area_uvc, breaks = 20) # # agg_sites <- aggregate(ID_transect_num ~ sites + locality, data = transect_areas, # FUN = function(x) length(unique(x))) #

param_grid <- expand.grid(buffer = seq(0, 0.3, 0.05), target.area = seq(200, 800, 50)) pboptions(type = "none") # loop to test different parameters for(i in 1:nrow(param_grid)){ progress(value = i, max.value = nrow(param_grid), init = (i == 1), progress.bar = TRUE, char = "+") if (!file.exists(file.path("./output/fd_by_target_area_and_buffer_1000_reps",paste0("random_sample_of_transects_buffer_", param_grid$buffer[i], "_target_area_", param_grid$target.area[i], ".RDS")))) { fdres <- try(assemblage_random_subsets(transc_metadata = transect_areas, sp_abund = abundance_and_func_traits, reps = 1000, buffer = param_grid$buffer[i], parallel = para, target.area = param_grid$target.area[i], site.specific.biomass = TRUE, pb = FALSE), silent = TRUE) try(saveRDS(fdres, file.path("./output/fd_by_target_area_and_buffer_1000_reps",paste0("random_sample_of_transects_buffer_", param_grid$buffer[i], "_target_area_", param_grid$target.area[i], ".RDS"))), silent = TRUE) rm(fdres) } }

FD_results <- assemblage_random_subsets(X = transect_areas, Y = species_abundances, Z = functional_traits, reps = 10000, buffer = 0.1, parallel = para, target.area = 500, site.specific.biomass = TRUE) saveRDS(FD_results, "../output/preliminary_results_random_sample_of_transects.RDS")

rdss <- list.files(path = "./output/fd_by_target_area_and_buffer_1000_reps/", full.names = TRUE) transect_areas$ID_transect_num <- 1:nrow(transect_areas) orignal_ecoregion_n <- length(unique(transect_areas$Ecoregion)) orignal_locality_n <- length(unique(transect_areas$locality)) orignal_site_n <- length(unique(transect_areas$site_id)) orignal_transect_n <- nrow(transect_areas) original_total_area <- sum(transect_areas$area_uvc) out <- pbapply::pblapply(rdss, cl = 4, function(x){ rds <- readRDS(x) rds <- as.data.frame(rds) rds$Nb_sp <- apply(rds, 1, function(x) sum(x > 0)) rds$sites <- sapply(strsplit(rownames(rds), "/", fixed = TRUE), "[", 1) rds$ID_transect_num <- sapply(strsplit(rownames(rds), "/", fixed = TRUE), "[", 2) # rds <- rds[!duplicated(rds$ID_transect_num), ] # rds <- rds[!is.na(rds$Nb_sp), ] rds$ecoregion <- sapply(1:nrow(rds), function(y) transect_areas$Ecoregion[transect_areas$site_id == rds$sites[y]][1]) rds$locality <- sapply(1:nrow(rds), function(y) transect_areas$locality[transect_areas$site_id == rds$sites[y]][1]) buffer <- as.numeric(sapply(strsplit(basename(x), "_", fixed = TRUE), "[", 6)) target_area <- as.numeric(gsub(".RDS", "", sapply(strsplit(basename(x), "_", fixed = TRUE), "[", 9))) ecoregion_n <- round(length(unique(rds$ecoregion)) / orignal_ecoregion_n, 2) locality_n <- round(length(unique(rds$locality)) / orignal_locality_n, 2) sites_n <- round(length(unique(rds$sites)) / orignal_site_n, 2) ID_transect_num <- unlist(strsplit(rds$ID_transect_num, "-", fixed = TRUE)) transect_n <- round(length(unique(ID_transect_num)) / orignal_transect_n, 2) total_area <- sum(transect_areas$area_uvc[transect_areas$ID_transect_num %in% ID_transect_num]) / original_total_area mean_site_random_samples <- mean(table(rds$sites)) # transect over-representation index tb <- table(unlist(sapply(rds$ID_transect_num, function(x) strsplit(x, "-", fixed = TRUE)))) transect_overrepresentation <- sd(tb / mean(tb)) df <- data.frame(call = basename(x), buffer, target_area, ecoregion_n, locality_n, sites_n, transect_n, total_area, transect_overrepresentation, mean_site_random_samples) return(df) }) model_parameters <- do.call(rbind, out) write.csv(model_parameters, "./output/model_parameters.csv", row.names = FALSE)

model_parameters <- read.csv("./output/model_parameters.csv") model_parameters$buffer <- factor(model_parameters$buffer, levels = sort(unique(model_parameters$buffer))) model_parameters$target_area <- factor(model_parameters$target_area, levels = sort(unique(model_parameters$target_area))) params_grid <- expand.grid(settings = c("buffer", "target_area"), parameters = c("total_area", "transect_n", "transect_overrepresentation", "locality_n", "sites_n", "ecoregion_n", "mean_site_random_samples"), stringsAsFactors = FALSE) params_grid <- params_grid[order(params_grid$settings, decreasing = TRUE), ] ggs <- lapply(1:nrow(params_grid), function(x){ agg <- aggregate(as.formula(paste(params_grid$parameters[x], "~", params_grid$settings[x])), model_parameters, mean, na.rm = TRUE) gg <- ggplot(model_parameters, aes(x = get(params_grid$settings[x]), y = get(params_grid$parameters[x]))) + geom_violin(fill = viridis(10, alpha = 0.5)[which(unique(params_grid$parameters) %in% params_grid$parameters[x])]) + geom_point(data = agg, aes(x = get(params_grid$settings[x]), y = get(params_grid$parameters[x])), size = 3) + geom_beeswarm(alpha = 0.4) + labs(x = params_grid$settings[x], y = params_grid$parameters[x]) + theme(legend.position = "none") if (params_grid$parameters[x] == "transect_overrepresentation") gg <- gg + ylim(c(0, max(model_parameters$transect_overrepresentation, na.rm = TRUE))) else if (params_grid$parameters[x] == "mean_site_random_samples") gg <- gg + ylim(c(0, max(model_parameters$mean_site_random_samples))) else if (params_grid$parameters[x] %in% c("locality_n", "sites_n", "ecoregion_n")) gg <- gg + ylim(c(0, 1)) else return(gg) }) names(ggs) <- apply(params_grid, 1, paste, collapse = " vs ") ggs

## $`target_area vs total_area`

## ## $`target_area vs transect_n`

## ## $`target_area vs transect_overrepresentation`

## ## $`target_area vs locality_n`

## ## $`target_area vs sites_n`

## ## $`target_area vs ecoregion_n`

## ## $`target_area vs mean_site_random_samples`

## ## $`buffer vs total_area`

## ## $`buffer vs transect_n`

## ## $`buffer vs transect_overrepresentation`

## ## $`buffer vs locality_n`

## ## $`buffer vs sites_n`

## ## $`buffer vs ecoregion_n`

## ## $`buffer vs mean_site_random_samples`

mod <- model_parameters$call[which.max(model_parameters$locality_n)] rds <- readRDS(file.path("./output/fd_by_target_area_and_buffer_1000_reps/", mod)) rds <- as.data.frame(rds) rds$Nb_sp <- apply(rds, 1, function(x) sum(x > 0)) rds$sites <- sapply(strsplit(rownames(rds), "/", fixed = TRUE), "[", 1) rds$ID_transect_num <- sapply(strsplit(rownames(rds), "/", fixed = TRUE), "[", 2) rds$ecoregion <- sapply(1:nrow(rds), function(y) transect_areas$Ecoregion[transect_areas$site_id == rds$sites[y]][1]) rds$locality <- sapply(1:nrow(rds), function(y) transect_areas$locality[transect_areas$site_id == rds$sites[y]][1]) # ecoregions lost # setdiff(unique(transect_areas$ecoregion), unique(rds$ecoregion)) lost_localities <- setdiff(unique(transect_areas$locality), unique(rds$locality))

The maximum percentage of the total area from any model that can be included is 97%

These are the localities that are lost no matter which model parameters are used:

el_faro

These localities represent 1% of the sampled area.

R session information

## R version 4.1.0 (2021-05-18) ## Platform: x86_64-pc-linux-gnu (64-bit) ## Running under: Ubuntu 20.04.2 LTS ## ## Matrix products: default ## BLAS: /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3 ## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3 ## ## locale: ## [1] LC_CTYPE=pt_BR.UTF-8 LC_NUMERIC=C ## [3] LC_TIME=es_CR.UTF-8 LC_COLLATE=pt_BR.UTF-8 ## [5] LC_MONETARY=es_CR.UTF-8 LC_MESSAGES=pt_BR.UTF-8 ## [7] LC_PAPER=es_CR.UTF-8 LC_NAME=C ## [9] LC_ADDRESS=C LC_TELEPHONE=C ## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] ape_5.5 geometry_0.4.5 svMisc_1.1.4 ade4_1.7-17 ## [5] plyr_1.8.6 reshape_0.8.8 magrittr_2.0.3 dplyr_1.0.7 ## [9] cluster_2.1.2 viridis_0.6.2 viridisLite_0.4.0 ggbeeswarm_0.6.0 ## [13] ggplot2_3.3.5 pbapply_1.5-0 ## ## loaded via a namespace (and not attached): ## [1] Rcpp_1.0.8.3 lattice_0.20-44 assertthat_0.2.1 digest_0.6.29 ## [5] utf8_1.2.2 R6_2.5.1 magic_1.5-9 evaluate_0.15 ## [9] highr_0.9 pillar_1.6.4 rlang_1.0.1 rstudioapi_0.13 ## [13] jquerylib_0.1.4 rmarkdown_2.13 labeling_0.4.2 stringr_1.4.0 ## [17] munsell_0.5.0 compiler_4.1.0 vipor_0.4.5 xfun_0.30 ## [21] pkgconfig_2.0.3 htmltools_0.5.2 tidyselect_1.1.1 tibble_3.1.6 ## [25] gridExtra_2.3 fansi_1.0.0 crayon_1.5.0 withr_2.4.3 ## [29] MASS_7.3-54 grid_4.1.0 nlme_3.1-152 jsonlite_1.8.0 ## [33] gtable_0.3.0 lifecycle_1.0.1 DBI_1.1.1 scales_1.1.1 ## [37] cli_3.1.0 stringi_1.7.6 farver_2.1.0 bslib_0.2.5.1 ## [41] ellipsis_0.3.2 generics_0.1.0 vctrs_0.3.8 tools_4.1.0 ## [45] glue_1.6.2 beeswarm_0.4.0 purrr_0.3.4 abind_1.4-5 ## [49] parallel_4.1.0 fastmap_1.1.0 yaml_2.3.5 colorspace_2.0-2 ## [53] knitr_1.38 sass_0.4.0

Random sampling transect model optimization

Coral reef functional diversity/b>

Marcelo Araya-Salas, Andrea Arriaga et al

06-04-2022