1 Set Up & Objective

The purpose of this notebook is to take a look at how increasing the size of the training ensemble & tolerence can effect the size of ensemble that we can stitch together.

library(assertthat)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(kableExtra)

DIR <- here::here("..", "stitches", "krd_exp") 

theme_set(theme_bw())

list.files(DIR, "synthetic", full.names = TRUE) %>%  
  lapply(function(x){
    
   mod <- strsplit(basename(x), split = "_")[[1]][2]
    read.csv(x, stringsAsFactors = FALSE) %>% 
      select(-X) %>% 
      mutate(model = mod)
    
  }) %>%  
  dplyr::bind_rows() -> 
  stitched_data

list.files(DIR, "compdata", full.names = TRUE) %>%  
  lapply(function(x){
    out <- read.csv(x, stringsAsFactors = FALSE) 
    if(nrow(out)>1){
      return(out)
    }else{
      return(data.frame("X" = NA, 
                        "variable"=NA, 
                        "experiment"=NA, 
                        "ensemble"=NA, 
                        "model"=NA, 
                        "year"  =NA, 
                        "value" =NA, 
                        "zstore" =NA   ))
    }
  }) %>% 
  na.omit() %>% 
  bind_rows() %>% 
  select(variable, experiment, ensemble, model, year, value) %>% 
  distinct() -> 
  comp_data

2 Stitched Table

stitched_data %>%
  group_by(tol, experiment, ensemble_size, model) %>% 
  filter(year == 1850) %>% 
  summarise(n = n_distinct(value)) %>% 
  ungroup() -> 
  stitched_ensemble_counts
  
 stitched_ensemble_counts %>%  
  filter(ensemble_size != 1) %>% 
  spread(tol, n) %>% 
  arrange(experiment, ensemble_size) %>% 
   select(`Target Experiment` = experiment, `Training Ensemble Size` = ensemble_size, `Target Model` = model,  `0.07`,  `0.1`,
          `0.13`) %>% 
  kable() %>% 
   kable_styling()

Target Experiment	Training Ensemble Size	Target Model	0.07	0.1	0.13
ssp245	4	ACCESS-ESM1-5	7	9	11
ssp245	4	CanESM5	4	4	4
ssp245	4	FGOALS-g3	6	8	8
ssp245	4	GISS-E2-1-G	4	4	5
ssp245	4	MIROC-ES2L	5	7	8
ssp245	4	MPI-ESM1-2-LR	4	5	8
ssp245	5	CanESM5	5	5	5
ssp245	10	ACCESS-ESM1-5	15	19	24
ssp245	10	CanESM5	10	10	10
ssp245	10	MIROC-ES2L	12	13	19
ssp245	10	MPI-ESM1-2-LR	12	15	20
ssp245	20	CanESM5	20	20	20
ssp245	25	CanESM5	25	25	25
ssp370	4	ACCESS-ESM1-5	4	5	6
ssp370	4	CanESM5	4	4	4
ssp370	4	FGOALS-g3	5	5	7
ssp370	4	GISS-E2-1-G	4	4	6
ssp370	4	MIROC-ES2L	2	2	2
ssp370	4	MPI-ESM1-2-LR	4	4	7
ssp370	5	CanESM5	5	5	5
ssp370	10	ACCESS-ESM1-5	11	13	14
ssp370	10	CanESM5	10	10	10
ssp370	10	MIROC-ES2L	6	7	8
ssp370	10	MPI-ESM1-2-LR	11	14	18
ssp370	20	CanESM5	21	20	20
ssp370	25	CanESM5	25	25	25

3 Training Ensemble vs. Stitched Ensemble

stitched_ensemble_counts %>% 
  filter(tol == "0.07") %>% 
  ggplot() + 
  geom_abline(intercept = 0, slope = 1, color = "grey") + 
  geom_point(aes(ensemble_size, n, color = model, shape = experiment), size = 3, alpha = 0.5) + 
  facet_wrap("tol", ncol = 1) + 
  theme_bw() + 
  labs(x = "Target Ensemble Size", 
       y = "Stitched Ensemble Size")

stitched_ensemble_counts %>% 
  filter(tol == "0.1") %>% 
  ggplot() + 
  geom_abline(intercept = 0, slope = 1, color = "grey") + 
  geom_point(aes(ensemble_size, n, color = model, shape = experiment), size = 3, alpha = 0.5) + 
  facet_wrap("tol", ncol = 1) + 
  theme_bw() + 
  labs(x = "Target Ensemble Size", 
       y = "Stitched Ensemble Size")

stitched_ensemble_counts %>% 
  filter(tol == "0.13") %>% 
  ggplot() + 
  geom_abline(intercept = 0, slope = 1, color = "grey") + 
  geom_point(aes(ensemble_size, n, color = model, shape = experiment), size = 3, alpha = 0.5) + 
  facet_wrap("tol", ncol = 1) + 
  theme_bw() + 
  labs(x = "Target Ensemble Size", 
       y = "Stitched Ensemble Size")

3.1 Stitched Results

# Plot stiched results vs tol for a single model & experiment 
# Note this a sloppy function, relies on data that exists in the global environment 
# Args 
#   model_name: a string of the model name to plot 
#   experiment_name: a string of the experiment name to plot 
plot_stitched_data <- function(model_name, experiment_name){
  
  stitched_data %>%  
    left_join(stitched_ensemble_counts, by = c("tol", "experiment", "ensemble_size", "model")) %>% 
    filter(ensemble_size %in% c(5, 10, 20, 25)) %>% 
    filter(model == model_name & experiment == experiment_name) %>% 
    mutate(ens_label = paste0("size out ", n)) -> 
    data_to_plot 
  
  data_to_plot %>% 
    select(tol, ensemble_size, ens_label) %>% 
    distinct() -> 
    dat_text
  
  ggplot() + 
    geom_line(data = data_to_plot, aes(year, value, groupby = stitching_id),  alpha = 0.4) + 
    geom_text(data = dat_text, aes(x = -Inf, y = 2, label = ens_label),  hjust = -0.05, vjust  = -0.1) +
    facet_grid(tol~ensemble_size) +
    labs(title = model_name, 
         subtitle = experiment_name, 
         y = "deg C")
  
}

3.1.1 SSP245

3.1.1.1 “ACCESS-ESM1-5”

plot_stitched_data("ACCESS-ESM1-5", "ssp245")

3.1.1.2 “CanESM5”

plot_stitched_data("CanESM5", "ssp245")

3.1.1.3 “MIROC-ES2L”

plot_stitched_data( "MIROC-ES2L", "ssp245")

3.1.1.4 “MPI-ESM1-2-LR”

plot_stitched_data("MPI-ESM1-2-LR", "ssp245")

3.1.2 ssp370

3.1.2.1 “ACCESS-ESM1-5”

plot_stitched_data("ACCESS-ESM1-5", "ssp370")

3.1.2.2 “CanESM5”

plot_stitched_data("CanESM5", "ssp370")

3.1.2.3 “MIROC-ES2L”

plot_stitched_data( "MIROC-ES2L", "ssp370")

3.1.2.4 “MPI-ESM1-2-LR”

plot_stitched_data("MPI-ESM1-2-LR", "ssp370")

4 Errors

# Calculate the jumps between each step in the data, it only works with a data frame that conntains
# data with a data from a single source. 
# 
# Args 
#   data: the data frame of values to aclcculate the difference in
get_jumps <- function(data){
  
  # Check the inputs! 
  assert_that(has_name(x = data, which = c("year", "value", "ensemble", "model")))
  assert_that(length(unique(data$year)) == length(data$year), msg = "mulitple entries for a single year")
  assert_that(all(diff(data$year) == 1))
  
  # Get the jumps 
  jumps <- diff(data$value)
  
  # Store the data in a data frame. 
  data %>% 
    select(model, ensemble, experiment) %>%  
    distinct() %>% 
    cbind(jumps = jumps) ->
    out
  
  return(out)
}

stitched_data %>%  
  filter(ensemble_size %in% c(5, 10, 20, 25)) %>% 
  filter(model != "ACCESS-ESM1-5") %>% 
  mutate(ensemble = paste0(stitching_id, "_", tol, "_", ensemble_size)) %>% 
  select(year, value, ensemble, model, experiment) %>% 
  distinct() %>% 
  split(., interaction(.$model, .$experiment, .$ensemble, drop = TRUE)) %>% 
  lapply(get_jumps) %>% 
  bind_rows -> 
  stitched_jumps

comp_data %>% 
  filter(model %in% stitched_jumps$model & experiment %in% stitched_jumps$experiment) %>% 
  split(., interaction(.$model, .$ensemble, .$experiment), drop = TRUE) %>% 
  lapply(get_jumps) %>% 
  bind_rows() -> 
  comp_jumps

# make the jump density plot 
plot_jump_density <- function(data, model_name){
  data %>%
    filter(model == model_name) %>% 
    ggplot() + 
    geom_density(aes(jumps, fill = source), alpha = 0.2) + 
    facet_wrap("experiment") + 
    labs(y = "Density of Jumps", x = NULL, title = model_name) -> 
    out 
  return(out)
}

stitched_jumps$source <- "stitched"

# lapply(stitched_jumps$ensemble, function(x){
#   unlist(strsplit(x, split = "_"))[2]
#   }) %>%
#   unlist ->
#   tol
# 
# lapply(stitched_jumps$ensemble, function(x){
#   unlist(strsplit(x, split = "_"))[3]
#   }) %>%
#   unlist ->
#   en_size
# 
# stitched_jumps$tol <- tol
# stitched_jumps$en_size <- en_size
# 
# stitched_jumps %>%
#   mutate(source = paste0(source, "_", tol, "_", en_size)) ->
#   stitched_jumps

comp_jumps$source <- "original"

jump_data <- bind_rows(stitched_jumps, comp_jumps)

plot_jump_density(data = jump_data, model_name = "MPI-ESM1-2-LR")

plot_jump_density(data = jump_data, model_name = "CanESM5")

plot_jump_density(data = jump_data, model_name = "MIROC-ES2L")

jump_data %>% 
  group_by(model, source, experiment) %>% 
  summarise(mean = mean(jumps), 
            sd = sd(jumps)) %>%  
  gather(mean, sd, key = "variable", value = "value") %>%  
  spread(source, value) %>% 
  mutate(diff = abs(original - stitched)/ original) %>% 
  select(model, experiment, variable, diff) %>% 
  spread(variable, diff) %>% 
  knitr::kable(caption = "jumps; abs(original - stitched)/original") %>% 
  kable_styling()

jumps; abs(original - stitched)/original
model	experiment	mean	sd
CanESM5	ssp245	0.0060603	0.0988185
CanESM5	ssp370	0.0102871	0.1071776
MIROC-ES2L	ssp245	0.0273818	0.0440580
MIROC-ES2L	ssp370	0.0345149	0.0121302
MPI-ESM1-2-LR	ssp245	0.0100443	0.0872434
MPI-ESM1-2-LR	ssp370	0.0040811	0.0730655

stitched_data %>%  
  filter(ensemble_size %in% c(5, 10, 20, 25)) %>% 
  filter(model != "ACCESS-ESM1-5") %>% 
  group_by(model, experiment) %>% 
  summarise(mean = mean(value), 
            sd = sd(value)) %>% 
  ungroup %>% 
  gather(mean, sd, key = "stat", value = "value") %>% 
  mutate(source = "stitched") -> 
  stitched_mean

comp_data %>% 
  filter(model %in% stitched_mean$model & experiment %in% stitched_mean$experiment) %>% 
  group_by(experiment, model) %>% 
  summarise(mean = mean(value), 
            sd = sd(value)) %>% 
    ungroup %>% 
  gather(mean, sd, key = "stat", value = "value") %>% 
  mutate(source = "original") -> 
  original_mean

bind_rows(stitched_mean, original_mean) %>% 
  spread(source, value) %>% 
  mutate(diff = abs(original - stitched) / original) %>% 
  select(model, experiment, diff, stat) %>% 
  spread(stat, diff) %>% 
  knitr::kable(caption = "mean data; abs(original - stitched)/original") %>% 
  kable_styling()

mean data; abs(original - stitched)/original
model	experiment	mean	sd
CanESM5	ssp245	0.1811518	0.0002302
CanESM5	ssp370	0.0196102	0.0010959
MIROC-ES2L	ssp245	0.0636297	0.0019242
MIROC-ES2L	ssp370	0.0944577	0.0046249
MPI-ESM1-2-LR	ssp245	-0.0510506	0.0028091
MPI-ESM1-2-LR	ssp370	0.2129963	0.0007331

5 Problems

So it looks like ACCESS-ESM1-5 has gaps in the data! Why is that, should that be legal or should that be illegal? Or shoudl we throw an error.

experiment_name <- "ssp245"
model_name <- "ACCESS-ESM1-5"
stitched_data %>%  
    left_join(stitched_ensemble_counts, by = c("tol", "experiment", "ensemble_size", "model")) %>% 
    filter(ensemble_size %in% c(5, 10, 20, 25)) %>% 
    filter(model == "ACCESS-ESM1-5" & experiment == "ssp245") %>% 
    mutate(ens_label = paste0("size out ", n)) -> 
    data_to_plot 
  
  data_to_plot %>% 
    select(tol, ensemble_size, ens_label) %>% 
    distinct() -> 
    dat_text
  
  ggplot() + 
    geom_line(data = data_to_plot, aes(year, value, groupby = stitching_id, color = stitching_id ),  alpha = 0.4) + 
    geom_text(data = dat_text, aes(x = -Inf, y = 2, label = ens_label),  hjust = -0.05, vjust  = -0.1) +
    facet_grid(tol~ensemble_size) +
    labs(title = model_name, 
         subtitle = experiment_name, 
         y = "deg C") + 
    theme(legend.position = "none")

data_to_plot %>%  
  filter(stitching_id == "ssp245~r19i1p1f1~1") %>% 
  
  ggplot() + 
    geom_point(aes(year, value, groupby = stitching_id, color = stitching_id ),  alpha = 0.4) + 
    geom_text(data = dat_text, aes(x = -Inf, y = 2, label = ens_label),  hjust = -0.05, vjust  = -0.1) +
    facet_grid(tol~ensemble_size) +
    labs(title = "ACCESS-ESM1-5", 
         subtitle = experiment_name, 
         y = "deg C") + 
    theme(legend.position = "none")

Ensemble Size Investigation

11 November, 2021