## vector with package names
x <- c("pbapply", "parallel", "knitr", "NormalGamma", "DT")
aa <- lapply(x, function(y) {
# check if installed, if not then install
if (!y %in% installed.packages()[,"Package"])
install.packages(y)
# load package
try(require(y, character.only = T), silent = T)
})
spp
)nvars
) drawn from a normal-gamma distribution (gamma was used to avid negative values)dimorphic_vars
)effect_size
)covar_vars
). Covariance decreases steadly from the first variable. For instance, if covar_vars = 3
then variable 2 shows the highest covariance with 1 and 3 the highest covariance with 2.varying_and_dimorphic
)Three main scenarios where simulated:
Each scenario was run with 3 different numbers of dimorphic variables (10, 3 and 0 (no dimorphism)) and three effect sizes 0.5
Each model was replicated 1000 times. Each replicate returned the p-value of linear regression models for the dimorphism metrics as response (1 at the time) and the dimorphism categories (high, low and monomorphic) as predictor. So 3 p-values for each replicate. Dimorphism categories were encoded as ordered nominal variable
The statistical power and false positive rate (type I error) were calculated for each dimorphism metric. Power was defined as the proportion of significant p-values (alpha = 0.05) when there is actual dimorphism (10 or 3 dimorphic variables). A good test should have a power close to 1 when the sample size is decent, as in these simulations. False positive rate was measured as the proportion of significant p-values when there is no dimorphism (0 dimorphic variables)
options(scipen = 1, digits = 2)
opts_knit$set(root.dir = normalizePath(".."))
# calculate euclidean distance weighted by PC explained variance
weight_dist <- function(X, pca) {
# all (q-p)^2
q_p2 <- sapply(1:ncol(X), function(y) ((X[1, y] - X[2, y]) ^ 2))
# weight by PCA explained variance
weight_q_p2 <- q_p2 * (summary(pca)$importance[2, ] / max(summary(pca)$importance[2, ]))
# square root of sum
weight_d <- sqrt(sum(weight_q_p2))
return(weight_d)
}
# extract p values from lm models
lmp <- function (modelobject) {
if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
f <- summary(modelobject)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
attributes(p) <- NULL
return(p)
}
# simulating function
sim_pca_dimorph <- function(spp = 117, nvars = 13, dimorphic_vars = 3, covar_vars = 5, varying_and_dimorphic = "yes", same_dimorphic = TRUE, effect_size = 2){
dimorphism <- sample(c("monomorphic", "low dimorphism", "high dimorphism"), size = spp, replace = TRUE)
dimorphism_df <- replicate(n = nvars, rep("monomorphic", spp))
if (dimorphic_vars > 0)
for( i in 1:spp){
if (same_dimorphic){
if (dimorphism[i] == "low dimorphism")
dimorphism_df[i, 1:dimorphic_vars] <- "low dimorphism"
if (dimorphism[i] == "high dimorphism")
dimorphism_df[i, 1:dimorphic_vars] <- "high dimorphism"
} else {
if (dimorphism[i] == "low dimorphism")
dimorphism_df[i, sample(1:13, dimorphic_vars)] <- "low dimorphism"
if (dimorphism[i] == "high dimorphism")
dimorphism_df[i, sample(1:13, dimorphic_vars)] <- "high dimorphism"
}
}
sds <- sample(seq(1, 3, length.out = 3), size = nvars, replace = TRUE)
if (varying_and_dimorphic != "random")
sds <- sort(sds, decreasing = if(varying_and_dimorphic == "yes") TRUE else FALSE)
means <- sample(seq(10, 100, length.out = nvars))
males_vars <- list()
for(x in 1:nvars)
{
if(x <= covar_vars & x != 1)
males <- males_vars[[x - 1]] + rnorm(n = spp, mean = 0, sd = sds[x - 1]) else
males <- rnorm(n = spp, mean= means[x], sd=sds[x]) + rgamma(n = spp, shape= 2, scale= 2)
males_vars[[x]] <- males
}
vars <- list()
for(w in 1:nvars)
{
females <- males_vars[[w]]
# monomorphic
females[dimorphism_df[ , w] == "monomorphic"] <- females[dimorphism_df[ , w] == "monomorphic"] + rnorm(n = sum(dimorphism_df[ , w] == "monomorphic"), mean = 0, sd = sd(males) * 1/8)
# low dimorphism
females[dimorphism_df[ , w] == "low dimorphism"] <- females[dimorphism_df[ , w] == "low dimorphism"] + rnorm(n = sum(dimorphism_df[ , w] == "low dimorphism"), mean = 0, sd = sd(males) * 1/8 * effect_size)
# high dimorphism
females[dimorphism_df[ , w] == "high dimorphism"] <- females[dimorphism_df[ , w] == "high dimorphism"] + rnorm(n = sum(dimorphism_df[ , w] == "high dimorphism"), mean = 0, sd(males) * 1/8 * (effect_size^2))
# plot(males_vars[[w]], females)
vars[[w]] <- c(males_vars[[w]], females)
}
# put morpho variables together in a data frame
morph_dat <- as.data.frame(do.call(cbind, vars))
pca <- prcomp(morph_dat, scale. = TRUE)
sp_labs <- paste0(rep("sp", spp), 1:spp)
sp_labs <- c(sp_labs, sp_labs)
# regular euclidean distance
dists <- sapply(unique(sp_labs), function(x) stats::dist(pca$x[sp_labs == x, ]))
dist_df <- data.frame(dimorphism = dimorphism, dists = dists)
dist_df$dimorphism <- factor(dist_df$dimorphism, levels = c("monomorphic", "low dimorphism", "high dimorphism"))
dist_df$dimorphism <- ordered(dist_df$dimorphism)
# weighted distances
weight_dists <- sapply(unique(sp_labs), function(x) weight_dist(X = pca$x[sp_labs == x, ], pca))
weight_dist_df <- data.frame(dimorphism = dist_df$dimorphism, dists = weight_dists)
# ratios
ratios <- morph_dat[1:spp, ] / morph_dat[(spp + 1):(spp * 2), ]
# check ratios
# apply(ratios, 2, range)
# apply(abs(ratios -1), 2, range)
ratio_pca <- prcomp(abs(ratios - 1), scale. = FALSE)
ratio_df <- data.frame(dimorphism = dist_df$dimorphism, dists = scale(ratio_pca$x[, 1]))
return(list(morph_dat = morph_dat, dist_df = dist_df, weight_dist_df = weight_dist_df, ratio_df = ratio_df, pcas = list(raw_vars = summary(pca), ratios = summary(ratio_pca))))
}
# replicate model function
# model_call <- call("sim_pca_dimorph", dimorphic_vars = 7, covar_vars = 7, varying_and_dimorphic = FALSE)
repicate_models <- function(model, n = 1000){
ps <- replicate(n = n, simplify = FALSE, expr = {
# eval model
dimorphs <- try(eval(model), silent = TRUE)
if (!is(dimorphs, "try-error")){
# run models
lm_regular <- lm(dists ~ dimorphism, data = dimorphs$dist_df)
lm_weight <- lm(dists ~ dimorphism, data = dimorphs$weight_dist_df)
lm_ratio <- lm(dists ~ dimorphism, data = dimorphs$ratio_df)
out <- list(regular = dimorphs$dist_df, weighted = dimorphs$weight_dist_df, ratios = dimorphs$ratio_df)
mean_sds <- lapply(1:length(out), function(x) {
mns <- stats::aggregate(formula = dists ~ dimorphism, data = out[[x]], FUN = base::mean)
mns$sd <- stats::aggregate(formula = dists ~ dimorphism, data = out[[x]], FUN = stats::sd)[, 2]
mns$data <- names(out)[x]
return(mns)
})
mean_sds <- do.call(rbind, mean_sds)
mean_sds$p_value <- c(lmp(lm_regular), NA, NA,
lmp(lm_weight), NA, NA,
lmp(lm_ratio), NA, NA)
} else
mean_sds <- NULL
return(mean_sds)
})
ps <- do.call(rbind, ps)
p_regular <- sum(ps$p_value[ps$data == "regular" & !is.na(ps$p_value)] < 0.05) / sum(ps$data == "regular" & !is.na(ps$p_value))
p_weight <- sum(ps$p_value[ps$data == "weighted" & !is.na(ps$p_value)] < 0.05) / sum(ps$data == "weighted" & !is.na(ps$p_value))
p_ratio <- sum(ps$p_value[ps$data == "ratios" & !is.na(ps$p_value)] < 0.05) / sum(ps$data == "ratios" & !is.na(ps$p_value))
mean_sd <- aggregate(cbind(dists, sd) ~ dimorphism + data, data = ps, FUN = mean)
return(list(ps = data.frame(p_regular = p_regular, p_weight = p_weight, p_ratio = p_ratio), mean_sd = mean_sd))
}
This is an example data set generated by the simulation. In this case we used 10 dimorphic variables, 8 covarying variables and an effect size of 2
set.seed(54)
dimorphs <- sim_pca_dimorph(dimorphic_vars = 10, covar_vars = 8, varying_and_dimorphic = "random", effect_size = 2)
# as.data.frame(dimorphs$pcas$raw_vars$importance)
cols <- as.character(rep(dimorphs$dist_df$dimorphism, 2))
cols[cols == "monomorphic"] <- "gray"
cols[cols == "low dimorphism"] <- "orange"
cols[cols == "high dimorphism"] <- "red"
par(mfrow = c(5, 3))
# par(mfrow = c(2, 2))
for (i in 1:ncol(dimorphs$morph_dat))
# for (i in 1:4)
plot(c(dimorphs$morph_dat[1:(nrow(dimorphs$morph_dat) / 2), i]), c(dimorphs$morph_dat[((nrow(dimorphs$morph_dat) / 2) + 1):((nrow(dimorphs$morph_dat) / 2) * 2), i]), xlab = paste("Variable", i, "males"), ylab = paste("Variable", i, "females"), col = cols)
par(mfrow = c(1, 1))
On raw morphological data:
as.data.frame(dimorphs$pcas$raw_vars$importance)
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | PC12 | PC13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Standard deviation | 2.52 | 1.14 | 1.11 | 1.03 | 0.97 | 0.87 | 0.83 | 0.53 | 0.39 | 0.29 | 0.25 | 0.22 | 0.21 |
Proportion of Variance | 0.49 | 0.10 | 0.10 | 0.08 | 0.07 | 0.06 | 0.05 | 0.02 | 0.01 | 0.01 | 0.00 | 0.00 | 0.00 |
Cumulative Proportion | 0.49 | 0.59 | 0.68 | 0.76 | 0.84 | 0.89 | 0.95 | 0.97 | 0.98 | 0.99 | 0.99 | 1.00 | 1.00 |
Note that with 8 covarying variables the variance explained by the first PC is similar to that of PC1 from the real data
On ratio data:
as.data.frame(dimorphs$pcas$ratios$importance)
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | PC12 | PC13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Standard deviation | 0.04 | 0.02 | 0.02 | 0.02 | 0.02 | 0.02 | 0.02 | 0.01 | 0.01 | 0.01 | 0.01 | 0 | 0 |
Proportion of Variance | 0.37 | 0.13 | 0.10 | 0.09 | 0.08 | 0.07 | 0.06 | 0.04 | 0.03 | 0.01 | 0.01 | 0 | 0 |
Cumulative Proportion | 0.37 | 0.51 | 0.61 | 0.70 | 0.78 | 0.85 | 0.91 | 0.95 | 0.98 | 0.99 | 1.00 | 1 | 1 |
set.seed(54)
dimorphs <- sim_pca_dimorph(dimorphic_vars = 10, covar_vars = 8, varying_and_dimorphic = "random", effect_size = 3)
# as.data.frame(dimorphs$pcas$raw_vars$importance)
cols <- as.character(rep(dimorphs$dist_df$dimorphism, 2))
cols[cols == "monomorphic"] <- "gray"
cols[cols == "low dimorphism"] <- "orange"
cols[cols == "high dimorphism"] <- "red"
par(mfrow = c(5, 3))
# par(mfrow = c(2, 2))
for (i in 1:ncol(dimorphs$morph_dat))
# for (i in 1:4)
plot(c(dimorphs$morph_dat[1:(nrow(dimorphs$morph_dat) / 2), i]), c(dimorphs$morph_dat[((nrow(dimorphs$morph_dat) / 2) + 1):((nrow(dimorphs$morph_dat) / 2) * 2), i]), xlab = paste("Variable", i, "males"), ylab = paste("Variable", i, "females"), col = cols)
par(mfrow = c(1, 1))
On raw morphological data:
as.data.frame(dimorphs$pcas$raw_vars$importance)
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | PC12 | PC13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Standard deviation | 2.44 | 1.12 | 1.10 | 1.03 | 0.99 | 0.88 | 0.84 | 0.62 | 0.47 | 0.46 | 0.32 | 0.30 | 0.29 |
Proportion of Variance | 0.46 | 0.10 | 0.09 | 0.08 | 0.08 | 0.06 | 0.05 | 0.03 | 0.02 | 0.02 | 0.01 | 0.01 | 0.01 |
Cumulative Proportion | 0.46 | 0.55 | 0.65 | 0.73 | 0.80 | 0.86 | 0.92 | 0.95 | 0.96 | 0.98 | 0.99 | 0.99 | 1.00 |
On ratio data:
as.data.frame(dimorphs$pcas$ratios$importance)
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | PC11 | PC12 | PC13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Standard deviation | 0.11 | 0.06 | 0.05 | 0.04 | 0.04 | 0.04 | 0.03 | 0.03 | 0.03 | 0.01 | 0.01 | 0 | 0 |
Proportion of Variance | 0.47 | 0.13 | 0.09 | 0.08 | 0.07 | 0.05 | 0.04 | 0.03 | 0.02 | 0.01 | 0.00 | 0 | 0 |
Cumulative Proportion | 0.47 | 0.60 | 0.70 | 0.77 | 0.84 | 0.90 | 0.94 | 0.97 | 0.99 | 1.00 | 1.00 | 1 | 1 |
Simulations were run for all possible combinations of the following parameters:
A total of 162 different simulation, each one replicated 10000 times
grid <- expand.grid(dimorphic_vars = c(10, 3, 0),
covar_vars = c(10, 8, 3),
varying_and_dimorphic = c("yes", "no", "random"),
same_dimorphic = c(TRUE, FALSE),
effect_size = c(2, 3, 4))
out <- pblapply(1:nrow(grid), cl = 18, function(y){
reps <- repicate_models(model = call("sim_pca_dimorph", dimorphic_vars = grid$dimorphic_vars[y], covar_vars = grid$covar_vars[y], varying_and_dimorphic = grid$varying_and_dimorphic[y], same_dimorphic = grid$same_dimorphic[y], effect_size = grid$effect_size[y]), n = 10000)
reps$ps$dimorphic_vars <- grid$dimorphic_vars[y]
reps$ps$covar_vars <- grid$covar_vars[y]
reps$ps$varying_and_dimorphic = grid$varying_and_dimorphic[y]
reps$ps$same_dimorphic = if(grid$same_dimorphic[y]) "yes" else "no"
reps$ps$effect_size = if(grid$effect_size[y] == 2) "low" else if(grid$effect_size[y] == 3) "moderate" else "high"
return(reps)
})
saveRDS(out, "./output/10000_replicates_morphological_dimorphism.RDS")
The columns “p_regular”, “p_weight” and “p_ratio” show the proportion of replications that found a significant effect for the three dimorphism metrics
out <- readRDS("./output/10000_replicates_morphological_dimorphism.RDS")
ps <- lapply(out, "[[", 1)
ps <- do.call(rbind, ps)
datatable(ps[ps$dimorphic_vars == 10, ],
editable = list(target = 'row'),
rownames = FALSE, style = "bootstrap", filter = 'top',
options = list(pageLength = 100, autoWidth = TRUE, dom = 'ft'),
autoHideNavigation = TRUE, escape = FALSE)
datatable(ps[ps$dimorphic_vars == 3, ],
editable = list(target = 'row'),
rownames = FALSE, style = "bootstrap", filter = 'top',
options = list(pageLength = 100, autoWidth = TRUE, dom = 'ft'),
autoHideNavigation = TRUE, escape = FALSE)
datatable(ps[ps$dimorphic_vars == 0, ],
editable = list(target = 'row'),
rownames = FALSE, style = "bootstrap", filter = 'top',
options = list(pageLength = 100, autoWidth = TRUE, dom = 'ft'),
autoHideNavigation = TRUE, escape = FALSE)
Session information
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=pt_PT.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=es_CR.UTF-8 LC_COLLATE=pt_PT.UTF-8
## [5] LC_MONETARY=es_CR.UTF-8 LC_MESSAGES=pt_PT.UTF-8
## [7] LC_PAPER=es_CR.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=es_CR.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] DT_0.16 NormalGamma_1.1 histogram_0.0-25 optimx_2020-4.2
## [5] knitr_1.30 pbapply_1.4-3
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.27 R6_2.5.0 jsonlite_1.7.1
## [4] magrittr_2.0.1 evaluate_0.14 highr_0.8
## [7] rlang_0.4.10 stringi_1.5.3 rmarkdown_2.4
## [10] tools_4.0.3 stringr_1.4.0 htmlwidgets_1.5.2
## [13] crosstalk_1.1.0.1 numDeriv_2016.8-1.1 xfun_0.20
## [16] yaml_2.2.1 compiler_4.0.3 htmltools_0.5.0