1 Set Up & Objective

The purpose of this notebook is to take a look at how increasing the size of the training ensemble & tolerence can effect the size of ensemble that we can stitch together.

library(assertthat)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(kableExtra)

DIR <- here::here("..", "stitches", "krd_exp") 

theme_set(theme_bw())
list.files(DIR, "synthetic", full.names = TRUE) %>%  
  lapply(function(x){
    
   mod <- strsplit(basename(x), split = "_")[[1]][2]
    read.csv(x, stringsAsFactors = FALSE) %>% 
      select(-X) %>% 
      mutate(model = mod)
    
  }) %>%  
  dplyr::bind_rows() -> 
  stitched_data
list.files(DIR, "compdata", full.names = TRUE) %>%  
  lapply(function(x){
    out <- read.csv(x, stringsAsFactors = FALSE) 
    if(nrow(out)>1){
      return(out)
    }else{
      return(data.frame("X" = NA, 
                        "variable"=NA, 
                        "experiment"=NA, 
                        "ensemble"=NA, 
                        "model"=NA, 
                        "year"  =NA, 
                        "value" =NA, 
                        "zstore" =NA   ))
    }
  }) %>% 
  na.omit() %>% 
  bind_rows() %>% 
  select(variable, experiment, ensemble, model, year, value) %>% 
  distinct() -> 
  comp_data

2 Stitched Table

stitched_data %>%
  group_by(tol, experiment, ensemble_size, model) %>% 
  filter(year == 1850) %>% 
  summarise(n = n_distinct(value)) %>% 
  ungroup() -> 
  stitched_ensemble_counts
  
 stitched_ensemble_counts %>%  
  filter(ensemble_size != 1) %>% 
  spread(tol, n) %>% 
  arrange(experiment, ensemble_size) %>% 
   select(`Target Experiment` = experiment, `Training Ensemble Size` = ensemble_size, `Target Model` = model,  `0.07`,  `0.1`,
          `0.13`) %>% 
  kable() %>% 
   kable_styling()
Target Experiment Training Ensemble Size Target Model 0.07 0.1 0.13
ssp245 4 ACCESS-ESM1-5 7 9 11
ssp245 4 CanESM5 4 4 4
ssp245 4 FGOALS-g3 6 8 8
ssp245 4 GISS-E2-1-G 4 4 5
ssp245 4 MIROC-ES2L 5 7 8
ssp245 4 MPI-ESM1-2-LR 4 5 8
ssp245 5 CanESM5 5 5 5
ssp245 10 ACCESS-ESM1-5 15 19 24
ssp245 10 CanESM5 10 10 10
ssp245 10 MIROC-ES2L 12 13 19
ssp245 10 MPI-ESM1-2-LR 12 15 20
ssp245 20 CanESM5 20 20 20
ssp245 25 CanESM5 25 25 25
ssp370 4 ACCESS-ESM1-5 4 5 6
ssp370 4 CanESM5 4 4 4
ssp370 4 FGOALS-g3 5 5 7
ssp370 4 GISS-E2-1-G 4 4 6
ssp370 4 MIROC-ES2L 2 2 2
ssp370 4 MPI-ESM1-2-LR 4 4 7
ssp370 5 CanESM5 5 5 5
ssp370 10 ACCESS-ESM1-5 11 13 14
ssp370 10 CanESM5 10 10 10
ssp370 10 MIROC-ES2L 6 7 8
ssp370 10 MPI-ESM1-2-LR 11 14 18
ssp370 20 CanESM5 21 20 20
ssp370 25 CanESM5 25 25 25

3 Training Ensemble vs. Stitched Ensemble

stitched_ensemble_counts %>% 
  filter(tol == "0.07") %>% 
  ggplot() + 
  geom_abline(intercept = 0, slope = 1, color = "grey") + 
  geom_point(aes(ensemble_size, n, color = model, shape = experiment), size = 3, alpha = 0.5) + 
  facet_wrap("tol", ncol = 1) + 
  theme_bw() + 
  labs(x = "Target Ensemble Size", 
       y = "Stitched Ensemble Size") 

stitched_ensemble_counts %>% 
  filter(tol == "0.1") %>% 
  ggplot() + 
  geom_abline(intercept = 0, slope = 1, color = "grey") + 
  geom_point(aes(ensemble_size, n, color = model, shape = experiment), size = 3, alpha = 0.5) + 
  facet_wrap("tol", ncol = 1) + 
  theme_bw() + 
  labs(x = "Target Ensemble Size", 
       y = "Stitched Ensemble Size") 

stitched_ensemble_counts %>% 
  filter(tol == "0.13") %>% 
  ggplot() + 
  geom_abline(intercept = 0, slope = 1, color = "grey") + 
  geom_point(aes(ensemble_size, n, color = model, shape = experiment), size = 3, alpha = 0.5) + 
  facet_wrap("tol", ncol = 1) + 
  theme_bw() + 
  labs(x = "Target Ensemble Size", 
       y = "Stitched Ensemble Size") 

3.1 Stitched Results

# Plot stiched results vs tol for a single model & experiment 
# Note this a sloppy function, relies on data that exists in the global environment 
# Args 
#   model_name: a string of the model name to plot 
#   experiment_name: a string of the experiment name to plot 
plot_stitched_data <- function(model_name, experiment_name){
  
  stitched_data %>%  
    left_join(stitched_ensemble_counts, by = c("tol", "experiment", "ensemble_size", "model")) %>% 
    filter(ensemble_size %in% c(5, 10, 20, 25)) %>% 
    filter(model == model_name & experiment == experiment_name) %>% 
    mutate(ens_label = paste0("size out ", n)) -> 
    data_to_plot 
  
  data_to_plot %>% 
    select(tol, ensemble_size, ens_label) %>% 
    distinct() -> 
    dat_text
  
  ggplot() + 
    geom_line(data = data_to_plot, aes(year, value, groupby = stitching_id),  alpha = 0.4) + 
    geom_text(data = dat_text, aes(x = -Inf, y = 2, label = ens_label),  hjust = -0.05, vjust  = -0.1) +
    facet_grid(tol~ensemble_size) +
    labs(title = model_name, 
         subtitle = experiment_name, 
         y = "deg C")
  
}

3.1.1 SSP245

3.1.1.1 “ACCESS-ESM1-5”

plot_stitched_data("ACCESS-ESM1-5", "ssp245")

3.1.1.2 “CanESM5”

plot_stitched_data("CanESM5", "ssp245")

3.1.1.3 “MIROC-ES2L”

plot_stitched_data( "MIROC-ES2L", "ssp245")

3.1.1.4 “MPI-ESM1-2-LR”

plot_stitched_data("MPI-ESM1-2-LR", "ssp245")

3.1.2 ssp370

3.1.2.1 “ACCESS-ESM1-5”

plot_stitched_data("ACCESS-ESM1-5", "ssp370")

3.1.2.2 “CanESM5”

plot_stitched_data("CanESM5", "ssp370")

3.1.2.3 “MIROC-ES2L”

plot_stitched_data( "MIROC-ES2L", "ssp370")

3.1.2.4 “MPI-ESM1-2-LR”

plot_stitched_data("MPI-ESM1-2-LR", "ssp370")

4 Errors

# Calculate the jumps between each step in the data, it only works with a data frame that conntains
# data with a data from a single source. 
# 
# Args 
#   data: the data frame of values to aclcculate the difference in
get_jumps <- function(data){
  
  # Check the inputs! 
  assert_that(has_name(x = data, which = c("year", "value", "ensemble", "model")))
  assert_that(length(unique(data$year)) == length(data$year), msg = "mulitple entries for a single year")
  assert_that(all(diff(data$year) == 1))
  
  # Get the jumps 
  jumps <- diff(data$value)
  
  # Store the data in a data frame. 
  data %>% 
    select(model, ensemble, experiment) %>%  
    distinct() %>% 
    cbind(jumps = jumps) ->
    out
  
  return(out)
}
stitched_data %>%  
  filter(ensemble_size %in% c(5, 10, 20, 25)) %>% 
  filter(model != "ACCESS-ESM1-5") %>% 
  mutate(ensemble = paste0(stitching_id, "_", tol, "_", ensemble_size)) %>% 
  select(year, value, ensemble, model, experiment) %>% 
  distinct() %>% 
  split(., interaction(.$model, .$experiment, .$ensemble, drop = TRUE)) %>% 
  lapply(get_jumps) %>% 
  bind_rows -> 
  stitched_jumps
comp_data %>% 
  filter(model %in% stitched_jumps$model & experiment %in% stitched_jumps$experiment) %>% 
  split(., interaction(.$model, .$ensemble, .$experiment), drop = TRUE) %>% 
  lapply(get_jumps) %>% 
  bind_rows() -> 
  comp_jumps
# make the jump density plot 
plot_jump_density <- function(data, model_name){
  data %>%
    filter(model == model_name) %>% 
    ggplot() + 
    geom_density(aes(jumps, fill = source), alpha = 0.2) + 
    facet_wrap("experiment") + 
    labs(y = "Density of Jumps", x = NULL, title = model_name) -> 
    out 
  return(out)
}
stitched_jumps$source <- "stitched"

# lapply(stitched_jumps$ensemble, function(x){
#   unlist(strsplit(x, split = "_"))[2]
#   }) %>%
#   unlist ->
#   tol
# 
# lapply(stitched_jumps$ensemble, function(x){
#   unlist(strsplit(x, split = "_"))[3]
#   }) %>%
#   unlist ->
#   en_size
# 
# stitched_jumps$tol <- tol
# stitched_jumps$en_size <- en_size
# 
# stitched_jumps %>%
#   mutate(source = paste0(source, "_", tol, "_", en_size)) ->
#   stitched_jumps

comp_jumps$source <- "original"

jump_data <- bind_rows(stitched_jumps, comp_jumps)
plot_jump_density(data = jump_data, model_name = "MPI-ESM1-2-LR")

plot_jump_density(data = jump_data, model_name = "CanESM5")

plot_jump_density(data = jump_data, model_name = "MIROC-ES2L")

jump_data %>% 
  group_by(model, source, experiment) %>% 
  summarise(mean = mean(jumps), 
            sd = sd(jumps)) %>%  
  gather(mean, sd, key = "variable", value = "value") %>%  
  spread(source, value) %>% 
  mutate(diff = abs(original - stitched)/ original) %>% 
  select(model, experiment, variable, diff) %>% 
  spread(variable, diff) %>% 
  knitr::kable(caption = "jumps; abs(original - stitched)/original") %>% 
  kable_styling()
jumps; abs(original - stitched)/original
model experiment mean sd
CanESM5 ssp245 0.0060603 0.0988185
CanESM5 ssp370 0.0102871 0.1071776
MIROC-ES2L ssp245 0.0273818 0.0440580
MIROC-ES2L ssp370 0.0345149 0.0121302
MPI-ESM1-2-LR ssp245 0.0100443 0.0872434
MPI-ESM1-2-LR ssp370 0.0040811 0.0730655
stitched_data %>%  
  filter(ensemble_size %in% c(5, 10, 20, 25)) %>% 
  filter(model != "ACCESS-ESM1-5") %>% 
  group_by(model, experiment) %>% 
  summarise(mean = mean(value), 
            sd = sd(value)) %>% 
  ungroup %>% 
  gather(mean, sd, key = "stat", value = "value") %>% 
  mutate(source = "stitched") -> 
  stitched_mean

comp_data %>% 
  filter(model %in% stitched_mean$model & experiment %in% stitched_mean$experiment) %>% 
  group_by(experiment, model) %>% 
  summarise(mean = mean(value), 
            sd = sd(value)) %>% 
    ungroup %>% 
  gather(mean, sd, key = "stat", value = "value") %>% 
  mutate(source = "original") -> 
  original_mean

bind_rows(stitched_mean, original_mean) %>% 
  spread(source, value) %>% 
  mutate(diff = abs(original - stitched) / original) %>% 
  select(model, experiment, diff, stat) %>% 
  spread(stat, diff) %>% 
  knitr::kable(caption = "mean data; abs(original - stitched)/original") %>% 
  kable_styling()
mean data; abs(original - stitched)/original
model experiment mean sd
CanESM5 ssp245 0.1811518 0.0002302
CanESM5 ssp370 0.0196102 0.0010959
MIROC-ES2L ssp245 0.0636297 0.0019242
MIROC-ES2L ssp370 0.0944577 0.0046249
MPI-ESM1-2-LR ssp245 -0.0510506 0.0028091
MPI-ESM1-2-LR ssp370 0.2129963 0.0007331

5 Problems

So it looks like ACCESS-ESM1-5 has gaps in the data! Why is that, should that be legal or should that be illegal? Or shoudl we throw an error.

experiment_name <- "ssp245"
model_name <- "ACCESS-ESM1-5"
stitched_data %>%  
    left_join(stitched_ensemble_counts, by = c("tol", "experiment", "ensemble_size", "model")) %>% 
    filter(ensemble_size %in% c(5, 10, 20, 25)) %>% 
    filter(model == "ACCESS-ESM1-5" & experiment == "ssp245") %>% 
    mutate(ens_label = paste0("size out ", n)) -> 
    data_to_plot 
  
  data_to_plot %>% 
    select(tol, ensemble_size, ens_label) %>% 
    distinct() -> 
    dat_text
  
  ggplot() + 
    geom_line(data = data_to_plot, aes(year, value, groupby = stitching_id, color = stitching_id ),  alpha = 0.4) + 
    geom_text(data = dat_text, aes(x = -Inf, y = 2, label = ens_label),  hjust = -0.05, vjust  = -0.1) +
    facet_grid(tol~ensemble_size) +
    labs(title = model_name, 
         subtitle = experiment_name, 
         y = "deg C") + 
    theme(legend.position = "none")

data_to_plot %>%  
  filter(stitching_id == "ssp245~r19i1p1f1~1") %>% 
  
  ggplot() + 
    geom_point(aes(year, value, groupby = stitching_id, color = stitching_id ),  alpha = 0.4) + 
    geom_text(data = dat_text, aes(x = -Inf, y = 2, label = ens_label),  hjust = -0.05, vjust  = -0.1) +
    facet_grid(tol~ensemble_size) +
    labs(title = "ACCESS-ESM1-5", 
         subtitle = experiment_name, 
         y = "deg C") + 
    theme(legend.position = "none")