# Load libraries
library(tidyverse)
library(ggplot2)
library(ggpol)
# devtools::install_github(repo = "IndrajeetPatil/ggstatsplot")
library(ggstatsplot)
library(dplyr)
# Source the document with custom functions
source("src_masterfunctions_Supplement.R")
Here are the main functions used to create the half violin plot. This code was taken directly from https://github.com/tidyverse/ggplot2/issues/2459
# Half violin plot function:
"%||%" <- function(a, b) {
if (!is.null(a))
a
else
b
}
#=========================== function definition ===========================
geom_flat_violin <-
function(mapping = NULL,
data = NULL,
stat = "ydensity",
position = "dodge",
trim = TRUE,
scale = "area",
show.legend = NA,
inherit.aes = TRUE,
...) {
ggplot2::layer(
data = data,
mapping = mapping,
stat = stat,
geom = GeomFlatViolin,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params = list(trim = trim,
scale = scale,
...)
)
}
GeomFlatViolin <-
ggproto(
"GeomFlatViolin",
Geom,
setup_data = function(data, params) {
data$width <- data$width %||%
params$width %||% (resolution(data$x, FALSE) * 0.9)
# ymin, ymax, xmin, and xmax define the bounding rectangle for each group
data %>%
dplyr::group_by(.data = ., group) %>%
dplyr::mutate(
.data = .,
ymin = min(y),
ymax = max(y),
xmin = x,
xmax = x + width / 2
)
},
draw_group = function(data, panel_scales, coord)
{
# Find the points for the line to go all the way around
data <- base::transform(data,
xminv = x,
xmaxv = x + violinwidth * (xmax - x))
# Make sure it's sorted properly to draw the outline
newdata <-
base::rbind(
dplyr::arrange(.data = base::transform(data, x = xminv), y),
dplyr::arrange(.data = base::transform(data, x = xmaxv), -y)
)
# Close the polygon: set first and last point the same
# Needed for coord_polar and such
newdata <- rbind(newdata, newdata[1,])
ggplot2:::ggname("geom_flat_violin",
GeomPolygon$draw_panel(newdata, panel_scales, coord))
},
draw_key = draw_key_polygon,
default_aes = ggplot2::aes(
weight = 1,
colour = "grey20",
fill = "white",
size = 0.5,
alpha = NA,
linetype = "solid"
),
required_aes = c("x", "y")
)
First, load and filter all the data.
# Read in the file with all the accuracies
accuracy.df <- read.csv(list.files(pattern = "Accuracy_clean_w_WC_n_BIOCLIM_", full.names = T), as.is = T)
# We don't want to consider models that were developed in one park and then tested in another in this table,
# so get rid of those
no.cross.df <- accuracy.df[accuracy.df$park == accuracy.df$model.derived.from, ]
# However, we do want to keep models that were derived from the allparks and then tested on the individual parks
no.cross.df <- rbind(no.cross.df,
accuracy.df[accuracy.df$model.derived.from == "AllParks" & accuracy.df$park != "AllParks", ])
# Quickly make a plot to show the distribution of scores for each sub-approach
nc.ordered <- no.cross.df[order(no.cross.df$VEcv), ]
# We'll remove maps that have accuracies below what appears to be a pretty clear
# inflection point in the results: -500%
cut.off <- -500
# Based on the estimated inflection point in the plot above, remove points less than -500
no.cross.df <- no.cross.df[no.cross.df$VEcv > -500, ]
# Make sure all the sub.approaches have a value (RF won't yet)
no.cross.df$sub.approach[is.na(no.cross.df$sub.approach)] <- no.cross.df$Approach[is.na(no.cross.df$sub.approach)]
# Change regressions and unmixing to abbreviations so they fit well on the plot
# Could put them at diagonals, but then would lose some space.
no.cross.df$Approach[no.cross.df$Approach == "regression"] <- "REG"
no.cross.df$Approach[no.cross.df$Approach == "unmixing"] <- "UNMIX"
# Change some of the sub.approach names
no.cross.df$sub.approach[no.cross.df$sub.approach == "simple"] <- "Single \nVariable \nRegression"
no.cross.df$sub.approach[no.cross.df$sub.approach == "RNS"] <- "RNS \nRegression"
no.cross.df$sub.approach[no.cross.df$sub.approach == "STEP"] <- "Stepwise \nRegression"
# Approaches and sub.approaches need to be a factor
no.cross.df$Approach <- as.factor(no.cross.df$Approach)
no.cross.df$sub.approach <- as.factor(no.cross.df$sub.approach)
# Reorder the approach and sub.approach factors so the order is regressions then unmixing and
# that unmixing and regression approaches are grouped with themselves
no.cross.df$sub.approach <- factor(no.cross.df$sub.approach,
levels(no.cross.df$sub.approach)[c(5, 7, 4, 6, 2, 1, 3, 8)])
no.cross.df$Approach <- factor(no.cross.df$Approach,levels(no.cross.df$Approach)[c(1, 3, 2, 4)])
Make four boxplots displaying the results using different accuracy metrics.
# First do the ANOVA
V.aov <- aov(no.cross.df$VEcv ~ no.cross.df$Approach)
E.aov <- aov(no.cross.df$E1 ~ no.cross.df$Approach)
R.aov <- aov(no.cross.df$r_sqr ~ no.cross.df$Approach)
RMSE.aov <- aov(no.cross.df$RMSE ~ no.cross.df$Approach)
# Then the Tukey Honestly Significant Difference post-hoc test
V.T <- TukeyHSD(V.aov)
E.T <- TukeyHSD(E.aov)
R.T <- TukeyHSD(R.aov)
RMSE.T <- TukeyHSD(RMSE.aov)
# Now get the compact letter display lettering (this is a function I wrote and can be found in
# the code sourced at the top of this script)
V.cld <- compactLetterDisplay(V.T, no.cross.df$VEcv, no.cross.df$Approach)
E.cld <- compactLetterDisplay(E.T, no.cross.df$E1, no.cross.df$Approach)
R.cld <- compactLetterDisplay(R.T, no.cross.df$r_sqr, no.cross.df$Approach)
RMSE.cld <- compactLetterDisplay(RMSE.T, no.cross.df$RMSE, no.cross.df$Approach, large.values.are.best = F)
# Make the Approach box plots
V.gg <- ggplot(no.cross.df, aes(x=Approach, y=VEcv)) + theme_classic() +
# labs(title=expression(bold("VEcv"))) +
labs(y = "VEcv (%)") +
geom_hline(yintercept=0, linetype="dashed", color = "black") +
stat_boxplot(geom = "errorbar", width = 0.25) +
geom_boxplot(outlier.size = 0.1) + ylim(cut.off, 140) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate("text", x = 1:4, y = c(max(no.cross.df$VEcv[no.cross.df$Approach == "REG"]) + 30,
max(no.cross.df$VEcv[no.cross.df$Approach == "UNMIX"]) + 30,
max(no.cross.df$VEcv[no.cross.df$Approach == "RF"]) + 30,
max(no.cross.df$VEcv[no.cross.df$Approach == "VCF"]) + 30),
label = c("b", "c", "a", "b")) +
scale_y_continuous(breaks=c(-500, -400, -300, -200, -100, 0, 100))
E.gg <- ggplot(no.cross.df, aes(x=Approach, y=E1)) + theme_classic() +
# labs(title=expression(bold("E1"))) +
labs(y = "E1 (%)") +
geom_hline(yintercept=0, linetype="dashed", color = "black") +
stat_boxplot(geom = "errorbar", width = 0.25) +
geom_boxplot(outlier.size = 0.1) + ylim(-300, 100) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate("text", x = 1:4, y = c(max(no.cross.df$E1[no.cross.df$Approach == "REG"]) + 20,
max(no.cross.df$E1[no.cross.df$Approach == "UNMIX"]) + 20,
max(no.cross.df$E1[no.cross.df$Approach == "RF"]) + 20,
max(no.cross.df$E1[no.cross.df$Approach == "VCF"]) + 20),
label = c("b", "c", "a", "bc"))
R.gg <- ggplot(no.cross.df, aes(x=Approach, y=r_sqr)) + theme_classic() +
# labs(title=expression(bold(paste("R"^"2")))) +
labs(y = expression(paste("R"^"2"))) +
stat_boxplot(geom = "errorbar", width = 0.25) +
geom_boxplot(outlier.size = 0.1) + ylim(0, 1.05) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate("text", x = 1:4, y = c(max(no.cross.df$r_sqr[no.cross.df$Approach == "REG"]) + .055,
max(no.cross.df$r_sqr[no.cross.df$Approach == "UNMIX"]) + .055,
max(no.cross.df$r_sqr[no.cross.df$Approach == "RF"]) + .055,
max(no.cross.df$r_sqr[no.cross.df$Approach == "VCF"]) + .055),
label = c("b", "c", "a", "b"))
RMSE.gg <- ggplot(no.cross.df, aes(x=Approach, y=RMSE*100)) + theme_classic() +
# labs(title=expression(bold("RMSE"))) +
labs(y = "RMSE (% WC)") +
#geom_hline(yintercept=0, linetype="dashed", color = "black") +
stat_boxplot(geom = "errorbar", width = 0.25) +
geom_boxplot(outlier.size = 0.1) + ylim(100, -5) +
theme(plot.title = element_text(hjust = 0.5)) +
annotate("text", x = 1:4, y = c(min(no.cross.df$RMSE[no.cross.df$Approach == "REG"])*100 - 5.5,
min(no.cross.df$RMSE[no.cross.df$Approach == "UNMIX"])*100 - 5.5,
min(no.cross.df$RMSE[no.cross.df$Approach == "RF"])*100 - 5.5,
min(no.cross.df$RMSE[no.cross.df$Approach == "VCF"])*100 - 5.5),
label = c("b", "c", "a", "b"))
Last, make the main plot and put them all the plots together!
# Create a new df for the sub-approach plot, which will only show scores over 0
sdf <- no.cross.df[no.cross.df$VEcv >= 0, ]
# Do the ANOVA based on the full dataset (values greater than -1000)
diff <- aov(no.cross.df$VEcv ~ no.cross.df$sub.approach)
diff.sub <- aov(sdf$VEcv ~ sdf$sub.approach)
# Then the Tukey Honestly Significant Difference post-hoc test
t.df <- TukeyHSD(diff)
t.sub <- TukeyHSD(diff.sub)
# Now get the compact letter display lettering
cld.tbl <- compactLetterDisplay(t.df, no.cross.df$VEcv, no.cross.df$sub.approach)
cld.sub <- compactLetterDisplay(t.sub, sdf$VEcv, sdf$sub.approach)
# Create the sub-approach plot
p5 <- ggplot(no.cross.df, aes(x=sub.approach, y=VEcv)) + theme_classic() + ylim(-500, 100) +
labs(x = "Sub-approach", y = "VEcv (%)", title = "")
# Create a variable we'll use to adjust the position of the compact letter display lettering
letter.adjust <- 14
# Half violin + jitter + boxplot
# https://github.com/tidyverse/ggplot2/issues/2459
half.violin.jitter.box <- p5 + geom_hline(yintercept=0, linetype="dashed", color = "black") +
geom_flat_violin(position = position_nudge(x = 0, y = 0), adjust=2,
scale = "width", fill = "lightgray", colour = "gray") +
geom_point(aes(x = as.numeric(sub.approach)-.27, y = VEcv, group = sub.approach),
position = position_jitter(width = .1), size = .25) +
stat_boxplot(position = position_nudge(x = 0, y = 0), geom = "errorbar", width = 0.15) +
geom_boxplot(aes(x = as.numeric(sub.approach)+0, y = VEcv, group = sub.approach),
outlier.shape = NA, alpha = 0.3, width = .2, colour = "black") +
annotate("text", x = 1:8,
y = c(max(sdf$VEcv[sdf$sub.approach == "Single \nVariable \nRegression"]) + letter.adjust,
max(sdf$VEcv[sdf$sub.approach == "Stepwise \nRegression"]) + letter.adjust,
max(sdf$VEcv[sdf$sub.approach == "RNS \nRegression"]) + letter.adjust,
max(sdf$VEcv[sdf$sub.approach == "SMA"]) + letter.adjust,
max(sdf$VEcv[sdf$sub.approach == "MESMA"]) + letter.adjust,
max(sdf$VEcv[sdf$sub.approach == "MCU"]) + letter.adjust,
max(sdf$VEcv[sdf$sub.approach == "RF"]) + letter.adjust,
max(sdf$VEcv[sdf$sub.approach == "VCF"]) + letter.adjust),
label = c("b", "b", "b", "d", "c", "d", "a", "bc")) +
scale_y_continuous(breaks=c(-500, -400, -300, -200, -100, 0, 100))
# Put the small boxplots on top and the big plot on the bottom
{
jpeg("fig4_halfViolinJitterBox_topBottom.jpg", units = "in", width = 10, height = 8.5,
quality = 100, res = 1000)
pushViewport(viewport(layout = grid.layout(15, 4)))
vplayout <- function(x, y) viewport(layout.pos.row = x, layout.pos.col = y)
print(V.gg, vp = vplayout(1:4, 1))
print(E.gg, vp = vplayout(1:4, 2))
print(RMSE.gg, vp = vplayout(1:4, 3))
print(R.gg, vp = vplayout(1:4, 4))
print(half.violin.jitter.box, vp = vplayout(5:15, 1:4))
x1 <- .01
x2 <- .26
x3 <- .51
x4 <- .76
y1 <- .985
y2 <- .7
grid.text("(a)", x = unit(x1, "npc"), y = unit(y1, "npc"))
grid.text("(b)", x = unit(x2, "npc"), y = unit(y1, "npc"))
grid.text("(c)", x = unit(x3, "npc"), y = unit(y1, "npc"))
grid.text("(d)", x = unit(x4, "npc"), y = unit(y1, "npc"))
grid.text("(e)", x = unit(x1, "npc"), y = unit(y2, "npc"))
dev.off()
}
## quartz_off_screen
## 2