Half Violin Scatter BoxPlot

This code will create a half violin scatter box plot with four boxplots printed above it.

# Load libraries
library(tidyverse)
library(ggplot2)
library(ggpol)
# devtools::install_github(repo = "IndrajeetPatil/ggstatsplot")
library(ggstatsplot)
library(dplyr)

# Source the document with custom functions
source("src_masterfunctions_Supplement.R")

Here are the main functions used to create the half violin plot. This code was taken directly from https://github.com/tidyverse/ggplot2/issues/2459

# Half violin plot function:

"%||%" <- function(a, b) {
  if (!is.null(a))
    a
  else
    b
}

#=========================== function definition ===========================

geom_flat_violin <-
  function(mapping = NULL,
           data = NULL,
           stat = "ydensity",
           position = "dodge",
           trim = TRUE,
           scale = "area",
           show.legend = NA,
           inherit.aes = TRUE,
           ...) {
    ggplot2::layer(
      data = data,
      mapping = mapping,
      stat = stat,
      geom = GeomFlatViolin,
      position = position,
      show.legend = show.legend,
      inherit.aes = inherit.aes,
      params = list(trim = trim,
                    scale = scale,
                    ...)
    )
  }

GeomFlatViolin <-
  ggproto(
    "GeomFlatViolin",
    Geom,
    setup_data = function(data, params) {
      data$width <- data$width %||%
        params$width %||% (resolution(data$x, FALSE) * 0.9)
      
      # ymin, ymax, xmin, and xmax define the bounding rectangle for each group
      data %>%
        dplyr::group_by(.data = ., group) %>%
        dplyr::mutate(
          .data = .,
          ymin = min(y),
          ymax = max(y),
          xmin = x,
          xmax = x + width / 2
        )
    },
    
    draw_group = function(data, panel_scales, coord)
    {
      # Find the points for the line to go all the way around
      data <- base::transform(data,
                              xminv = x,
                              xmaxv = x + violinwidth * (xmax - x))
      
      # Make sure it's sorted properly to draw the outline
      newdata <-
        base::rbind(
          dplyr::arrange(.data = base::transform(data, x = xminv), y),
          dplyr::arrange(.data = base::transform(data, x = xmaxv), -y)
        )
      
      # Close the polygon: set first and last point the same
      # Needed for coord_polar and such
      newdata <- rbind(newdata, newdata[1,])
      
      ggplot2:::ggname("geom_flat_violin",
                       GeomPolygon$draw_panel(newdata, panel_scales, coord))
    },
    
    draw_key = draw_key_polygon,
    
    default_aes = ggplot2::aes(
      weight = 1,
      colour = "grey20",
      fill = "white",
      size = 0.5,
      alpha = NA,
      linetype = "solid"
    ),
    
    required_aes = c("x", "y")
  )

First, load and filter all the data.

# Read in the file with all the accuracies 
accuracy.df <- read.csv(list.files(pattern = "Accuracy_clean_w_WC_n_BIOCLIM_", full.names = T), as.is = T)

# We don't want to consider models that were developed in one park and then tested in another in this table,
# so get rid of those 
no.cross.df <- accuracy.df[accuracy.df$park == accuracy.df$model.derived.from, ]

# However, we do want to keep models that were derived from the allparks and then tested on the individual parks 
no.cross.df <- rbind(no.cross.df, 
                     accuracy.df[accuracy.df$model.derived.from == "AllParks" & accuracy.df$park != "AllParks", ])

# Quickly make a plot to show the distribution of scores for each sub-approach
nc.ordered <- no.cross.df[order(no.cross.df$VEcv), ]

# We'll remove maps that have accuracies below what appears to be a pretty clear
# inflection point in the results: -500%
cut.off <- -500

# Based on the estimated inflection point in the plot above, remove points less than -500
no.cross.df <- no.cross.df[no.cross.df$VEcv > -500, ]

# Make sure all the sub.approaches have a value (RF won't yet)
no.cross.df$sub.approach[is.na(no.cross.df$sub.approach)] <- no.cross.df$Approach[is.na(no.cross.df$sub.approach)]

# Change regressions and unmixing to abbreviations so they fit well on the plot
# Could put them at diagonals, but then would lose some space. 
no.cross.df$Approach[no.cross.df$Approach == "regression"] <- "REG"
no.cross.df$Approach[no.cross.df$Approach == "unmixing"] <- "UNMIX"

# Change some of the sub.approach names
no.cross.df$sub.approach[no.cross.df$sub.approach ==  "simple"] <- "Single \nVariable \nRegression"
no.cross.df$sub.approach[no.cross.df$sub.approach ==  "RNS"] <- "RNS \nRegression"
no.cross.df$sub.approach[no.cross.df$sub.approach ==  "STEP"] <- "Stepwise \nRegression"

# Approaches and sub.approaches need to be a factor
no.cross.df$Approach <- as.factor(no.cross.df$Approach)
no.cross.df$sub.approach <- as.factor(no.cross.df$sub.approach)

# Reorder the approach and sub.approach factors so the order is regressions then unmixing and
# that unmixing and regression approaches are grouped with themselves
no.cross.df$sub.approach <- factor(no.cross.df$sub.approach,
                                   levels(no.cross.df$sub.approach)[c(5, 7, 4, 6, 2, 1, 3, 8)])
no.cross.df$Approach <- factor(no.cross.df$Approach,levels(no.cross.df$Approach)[c(1, 3, 2, 4)])

Make four boxplots displaying the results using different accuracy metrics.

# First do the ANOVA
V.aov <- aov(no.cross.df$VEcv ~ no.cross.df$Approach)
E.aov <- aov(no.cross.df$E1 ~ no.cross.df$Approach)
R.aov <- aov(no.cross.df$r_sqr ~ no.cross.df$Approach)
RMSE.aov <- aov(no.cross.df$RMSE ~ no.cross.df$Approach)

# Then the Tukey Honestly Significant Difference post-hoc test
V.T <- TukeyHSD(V.aov)
E.T <- TukeyHSD(E.aov)
R.T <- TukeyHSD(R.aov)
RMSE.T <- TukeyHSD(RMSE.aov)

# Now get the compact letter display lettering (this is a function I wrote and can be found in
# the code sourced at the top of this script)
V.cld <- compactLetterDisplay(V.T, no.cross.df$VEcv, no.cross.df$Approach)
E.cld <- compactLetterDisplay(E.T, no.cross.df$E1, no.cross.df$Approach)
R.cld <- compactLetterDisplay(R.T, no.cross.df$r_sqr, no.cross.df$Approach)
RMSE.cld <- compactLetterDisplay(RMSE.T, no.cross.df$RMSE, no.cross.df$Approach, large.values.are.best = F)


# Make the Approach box plots
V.gg <- ggplot(no.cross.df, aes(x=Approach, y=VEcv)) + theme_classic() + 
  # labs(title=expression(bold("VEcv"))) +
  labs(y = "VEcv (%)") + 
  geom_hline(yintercept=0, linetype="dashed", color = "black") + 
  stat_boxplot(geom = "errorbar", width = 0.25) + 
  geom_boxplot(outlier.size = 0.1) + ylim(cut.off, 140) + 
  theme(plot.title = element_text(hjust = 0.5)) +
  annotate("text", x = 1:4, y = c(max(no.cross.df$VEcv[no.cross.df$Approach == "REG"]) + 30, 
                                  max(no.cross.df$VEcv[no.cross.df$Approach == "UNMIX"]) + 30,
                                  max(no.cross.df$VEcv[no.cross.df$Approach == "RF"]) + 30,
                                  max(no.cross.df$VEcv[no.cross.df$Approach == "VCF"]) + 30), 
           label = c("b", "c", "a", "b")) + 
  scale_y_continuous(breaks=c(-500, -400, -300, -200, -100, 0, 100))


E.gg <- ggplot(no.cross.df, aes(x=Approach, y=E1)) + theme_classic() + 
  # labs(title=expression(bold("E1"))) +
  labs(y = "E1 (%)") + 
  geom_hline(yintercept=0, linetype="dashed", color = "black") + 
  stat_boxplot(geom = "errorbar", width = 0.25) + 
  geom_boxplot(outlier.size = 0.1) + ylim(-300, 100) +
  theme(plot.title = element_text(hjust = 0.5)) + 
  annotate("text", x = 1:4, y = c(max(no.cross.df$E1[no.cross.df$Approach == "REG"]) + 20, 
                                  max(no.cross.df$E1[no.cross.df$Approach == "UNMIX"]) + 20,
                                  max(no.cross.df$E1[no.cross.df$Approach == "RF"]) + 20,
                                  max(no.cross.df$E1[no.cross.df$Approach == "VCF"]) + 20), 
           label = c("b", "c", "a", "bc"))

R.gg <- ggplot(no.cross.df, aes(x=Approach, y=r_sqr)) + theme_classic() + 
  # labs(title=expression(bold(paste("R"^"2")))) +
  labs(y = expression(paste("R"^"2"))) + 
  stat_boxplot(geom = "errorbar", width = 0.25) + 
  geom_boxplot(outlier.size = 0.1) + ylim(0, 1.05) + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  annotate("text", x = 1:4, y = c(max(no.cross.df$r_sqr[no.cross.df$Approach == "REG"]) + .055, 
                                  max(no.cross.df$r_sqr[no.cross.df$Approach == "UNMIX"]) + .055,
                                  max(no.cross.df$r_sqr[no.cross.df$Approach == "RF"]) + .055,
                                  max(no.cross.df$r_sqr[no.cross.df$Approach == "VCF"]) + .055), 
           label = c("b", "c", "a", "b"))

RMSE.gg <- ggplot(no.cross.df, aes(x=Approach, y=RMSE*100)) + theme_classic() + 
  # labs(title=expression(bold("RMSE"))) +
  labs(y = "RMSE (% WC)") +
  #geom_hline(yintercept=0, linetype="dashed", color = "black") + 
  stat_boxplot(geom = "errorbar", width = 0.25) + 
  geom_boxplot(outlier.size = 0.1) + ylim(100, -5) + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  annotate("text", x = 1:4, y = c(min(no.cross.df$RMSE[no.cross.df$Approach == "REG"])*100 - 5.5, 
                                  min(no.cross.df$RMSE[no.cross.df$Approach == "UNMIX"])*100 - 5.5,
                                  min(no.cross.df$RMSE[no.cross.df$Approach == "RF"])*100 - 5.5,
                                  min(no.cross.df$RMSE[no.cross.df$Approach == "VCF"])*100 - 5.5), 
           label = c("b", "c", "a", "b"))

Last, make the main plot and put them all the plots together!

# Create a new df for the sub-approach plot, which will only show scores over 0
sdf <- no.cross.df[no.cross.df$VEcv >= 0, ]

# Do the ANOVA based on the full dataset (values greater than -1000)
diff <- aov(no.cross.df$VEcv ~ no.cross.df$sub.approach)
diff.sub <- aov(sdf$VEcv ~ sdf$sub.approach)

# Then the Tukey Honestly Significant Difference post-hoc test
t.df <- TukeyHSD(diff)
t.sub <- TukeyHSD(diff.sub)

# Now get the compact letter display lettering
cld.tbl <- compactLetterDisplay(t.df, no.cross.df$VEcv, no.cross.df$sub.approach)
cld.sub <- compactLetterDisplay(t.sub, sdf$VEcv, sdf$sub.approach)


# Create the sub-approach plot
p5 <- ggplot(no.cross.df, aes(x=sub.approach, y=VEcv)) + theme_classic() + ylim(-500, 100) + 
  labs(x = "Sub-approach", y = "VEcv (%)", title = "") 

# Create a variable we'll use to adjust the position of the compact letter display lettering
letter.adjust <- 14

# Half violin + jitter + boxplot 
# https://github.com/tidyverse/ggplot2/issues/2459
half.violin.jitter.box <- p5 + geom_hline(yintercept=0, linetype="dashed", color = "black") + 
  geom_flat_violin(position = position_nudge(x = 0, y = 0), adjust=2, 
                   scale = "width", fill = "lightgray", colour = "gray") +
  geom_point(aes(x = as.numeric(sub.approach)-.27, y = VEcv, group = sub.approach), 
             position = position_jitter(width = .1), size = .25) + 
  stat_boxplot(position = position_nudge(x = 0, y = 0), geom = "errorbar", width = 0.15) + 
  geom_boxplot(aes(x = as.numeric(sub.approach)+0, y = VEcv, group = sub.approach), 
               outlier.shape = NA, alpha = 0.3, width = .2, colour = "black") +  
  annotate("text", x = 1:8, 
           y = c(max(sdf$VEcv[sdf$sub.approach == "Single \nVariable \nRegression"]) + letter.adjust, 
                                  max(sdf$VEcv[sdf$sub.approach == "Stepwise \nRegression"]) + letter.adjust,
                                  max(sdf$VEcv[sdf$sub.approach == "RNS \nRegression"]) + letter.adjust,
                                  max(sdf$VEcv[sdf$sub.approach == "SMA"]) + letter.adjust,
                                  max(sdf$VEcv[sdf$sub.approach == "MESMA"]) + letter.adjust,
                                  max(sdf$VEcv[sdf$sub.approach == "MCU"]) + letter.adjust,
                                  max(sdf$VEcv[sdf$sub.approach == "RF"]) + letter.adjust,
                                  max(sdf$VEcv[sdf$sub.approach == "VCF"]) + letter.adjust), 
           label = c("b", "b", "b", "d", "c", "d", "a", "bc")) + 
  scale_y_continuous(breaks=c(-500, -400, -300, -200, -100, 0, 100)) 


# Put the small boxplots on top and the big plot on the bottom
{
  jpeg("fig4_halfViolinJitterBox_topBottom.jpg", units = "in", width = 10, height = 8.5, 
       quality = 100, res = 1000)
  pushViewport(viewport(layout = grid.layout(15, 4)))
  vplayout <- function(x, y) viewport(layout.pos.row = x, layout.pos.col = y)
  print(V.gg, vp = vplayout(1:4, 1))
  print(E.gg, vp = vplayout(1:4, 2))
  print(RMSE.gg, vp = vplayout(1:4, 3))
  print(R.gg, vp = vplayout(1:4, 4))
  print(half.violin.jitter.box, vp = vplayout(5:15, 1:4))
  x1 <- .01
  x2 <- .26
  x3 <- .51
  x4 <- .76
  y1 <- .985
  y2 <- .7
  
  grid.text("(a)", x = unit(x1, "npc"), y = unit(y1, "npc"))
  grid.text("(b)", x = unit(x2, "npc"), y = unit(y1, "npc"))
  grid.text("(c)", x = unit(x3, "npc"), y = unit(y1, "npc"))
  grid.text("(d)", x = unit(x4, "npc"), y = unit(y1, "npc"))
  grid.text("(e)", x = unit(x1, "npc"), y = unit(y2, "npc"))
  
  dev.off()
}

## quartz_off_screen 
##                 2

Half Violin Scatter BoxPlot

Ryan Nagelkirk

3/7/2020

This code will create a half violin scatter box plot with four boxplots printed above it.