1 Set Up

The objective of this analysis is to look at what happens as n, the number of ensemble members, increases: how does sn, the synthetic ensemble size, change? We look at what happens when the tolerance is set at three different levels. For now we have only looked at the results for CanESM5.

library(ggplot2)
library(dplyr)
library(knitr)

# Root directory containing the experiment output folders read below
# ("outofsample_largetol", "in_sample", "unlimted_target_en").
# NOTE(review): absolute, machine-specific path -- update before running
# this analysis on another machine.
BASE_DIR <- "/Users/dorh012/projects/2021/cleanup_stitch/stitches/enriching_sample_size"

2 CanESM5

Looking at the tables of n vs sn and at the synthetic stitched results for CanESM5.

2.1 Tables of n vs sn

For n vs sn for CanESM5 with tol 0.07, 0.1, and 0.13 we saw an essentially 1 to 1 relationship (sn differs from n by at most one) regardless of the size of n. The only time this wasn’t the case was when we had an obscene tolerance set to 0.5; then we saw a non 1 to 1 relationship between n and sn (roughly n * 3.2 = sn), which I found somewhat puzzling…

2.1.1 Target experiments withheld from the training archive

# Read every "synthetic" file of the out-of-sample, large-tolerance run and
# stack the results into a single data frame. The experiment label and the
# leading integer of each file name are attached as extra columns.
data <- bind_rows(lapply(
  list.files(
    file.path(BASE_DIR, "outofsample_largetol"),
    "synthetic",
    full.names = TRUE
  ),
  function(path) {
    file_name <- basename(path)

    # Any file whose name does not mention ssp245 is labelled ssp370.
    scenario <- if (grepl(pattern = "ssp245", file_name)) {
      "ssp245"
    } else {
      "ssp370"
    }

    # Everything from the first underscore onwards is stripped; the remaining
    # prefix is read as an integer and stored in the `ensemble` column.
    n_train <- as.integer(
      gsub(pattern = "_.*", x = file_name, replacement = "")
    )

    contents <- read.csv(path, stringsAsFactors = FALSE)
    contents <- select(contents, year, value, variable, stitching_id, tol)
    mutate(contents, experiment = scenario, ensemble = n_train)
  }
))

# Tabulate sn, the number of distinct stitching ids, for every combination
# of experiment, n (the `ensemble` column) and tolerance.
data %>%
  group_by(experiment, n = ensemble, tol) %>%
  # State the grouping of the result explicitly instead of relying on the
  # summarise() default, which prints a "grouped output" message into the
  # rendered report.
  summarise(sn = n_distinct(stitching_id), .groups = "drop") %>%
  knitr::kable()
experiment n tol sn
ssp245 5 0.07 5
ssp245 5 0.10 5
ssp245 5 0.13 5
ssp245 5 0.50 15
ssp245 10 0.07 10
ssp245 10 0.10 10
ssp245 10 0.13 10
ssp245 10 0.50 32
ssp245 15 0.07 15
ssp245 15 0.10 15
ssp245 15 0.13 15
ssp245 15 0.50 47
ssp245 25 0.07 25
ssp245 25 0.10 25
ssp245 25 0.13 25
ssp245 25 0.50 79
ssp370 5 0.07 5
ssp370 5 0.10 5
ssp370 5 0.13 5
ssp370 5 0.50 16
ssp370 10 0.07 9
ssp370 10 0.10 10
ssp370 10 0.13 10
ssp370 10 0.50 31
ssp370 15 0.07 15
ssp370 15 0.10 15
ssp370 15 0.13 15
ssp370 15 0.50 46
ssp370 25 0.07 26
ssp370 25 0.10 25
ssp370 25 0.13 25
ssp370 25 0.50 80

2.1.2 Target experiments included in the training archive

# Read every "synthetic" file of the in-sample run and stack the results
# into a single data frame. The experiment label and the leading integer of
# each file name are attached as extra columns.
data <- bind_rows(lapply(
  list.files(
    file.path(BASE_DIR, "in_sample"),
    "synthetic",
    full.names = TRUE
  ),
  function(path) {
    file_name <- basename(path)

    # Any file whose name does not mention ssp245 is labelled ssp370.
    scenario <- if (grepl(pattern = "ssp245", file_name)) {
      "ssp245"
    } else {
      "ssp370"
    }

    # Everything from the first underscore onwards is stripped; the remaining
    # prefix is read as an integer and stored in the `ensemble` column.
    n_train <- as.integer(
      gsub(pattern = "_.*", x = file_name, replacement = "")
    )

    contents <- read.csv(path, stringsAsFactors = FALSE)
    contents <- select(contents, year, value, variable, stitching_id, tol)
    mutate(contents, experiment = scenario, ensemble = n_train)
  }
))

# Tabulate sn, the number of distinct stitching ids, for every combination
# of experiment, n (the `ensemble` column) and tolerance.
data %>%
  group_by(experiment, n = ensemble, tol) %>%
  # State the grouping of the result explicitly instead of relying on the
  # summarise() default, which prints a "grouped output" message into the
  # rendered report.
  summarise(sn = n_distinct(stitching_id), .groups = "drop") %>%
  knitr::kable()
experiment n tol sn
ssp245 5 0.07 5
ssp245 5 0.10 5
ssp245 5 0.13 5
ssp245 10 0.07 10
ssp245 10 0.10 10
ssp245 10 0.13 10
ssp245 15 0.07 16
ssp245 15 0.10 15
ssp245 15 0.13 15
ssp245 25 0.07 25
ssp245 25 0.10 25
ssp245 25 0.13 25
ssp370 5 0.07 5
ssp370 5 0.10 5
ssp370 5 0.13 5
ssp370 10 0.07 10
ssp370 10 0.10 10
ssp370 10 0.13 10
ssp370 15 0.07 15
ssp370 15 0.10 15
ssp370 15 0.13 15
ssp370 25 0.07 25
ssp370 25 0.10 25
ssp370 25 0.13 25

2.1.3 Target all 25 ensemble members

# Read every "synthetic" file of the run that targets all ensemble members
# and stack the results into a single data frame. The experiment label and
# the leading integer of each file name are attached as extra columns.
data <- bind_rows(lapply(
  list.files(
    file.path(BASE_DIR, "unlimted_target_en"),
    "synthetic",
    full.names = TRUE
  ),
  function(path) {
    file_name <- basename(path)

    # Any file whose name does not mention ssp245 is labelled ssp370.
    scenario <- if (grepl(pattern = "ssp245", file_name)) {
      "ssp245"
    } else {
      "ssp370"
    }

    # Everything from the first underscore onwards is stripped; the remaining
    # prefix is read as an integer and stored in the `ensemble` column.
    n_train <- as.integer(
      gsub(pattern = "_.*", x = file_name, replacement = "")
    )

    contents <- read.csv(path, stringsAsFactors = FALSE)
    contents <- select(contents, year, value, variable, stitching_id, tol)
    mutate(contents, experiment = scenario, ensemble = n_train)
  }
))

# Tabulate sn, the number of distinct stitching ids, for every combination
# of experiment, n (the `ensemble` column) and tolerance.
data %>%
  group_by(experiment, n = ensemble, tol) %>%
  # State the grouping of the result explicitly instead of relying on the
  # summarise() default, which prints a "grouped output" message into the
  # rendered report.
  summarise(sn = n_distinct(stitching_id), .groups = "drop") %>%
  knitr::kable()
experiment n tol sn
ssp245 5 0.07 5
ssp245 5 0.10 5
ssp245 5 0.13 5
ssp245 10 0.07 10
ssp245 10 0.10 10
ssp245 10 0.13 10
ssp245 15 0.07 15
ssp245 15 0.10 15
ssp245 15 0.13 15
ssp245 25 0.07 25
ssp245 25 0.10 25
ssp245 25 0.13 25
ssp370 5 0.07 5
ssp370 5 0.10 5
ssp370 5 0.13 5
ssp370 10 0.07 10
ssp370 10 0.10 10
ssp370 10 0.13 10
ssp370 15 0.07 15
ssp370 15 0.10 15
ssp370 15 0.13 15
ssp370 25 0.07 26
ssp370 25 0.10 25
ssp370 25 0.13 25

2.2 The Synthetic Stitched Results

# Read every "synthetic" file of the in-sample run and stack the results
# into a single data frame for plotting. Besides the experiment label and
# the leading integer of the file name, a combined "tol (n)" label is added.
data <- bind_rows(lapply(
  list.files(
    file.path(BASE_DIR, "in_sample"),
    "synthetic",
    full.names = TRUE
  ),
  function(path) {
    file_name <- basename(path)

    # Any file whose name does not mention ssp245 is labelled ssp370.
    scenario <- if (grepl(pattern = "ssp245", file_name)) {
      "ssp245"
    } else {
      "ssp370"
    }

    # Everything from the first underscore onwards is stripped; the remaining
    # prefix is read as an integer and stored in the `ensemble` column.
    n_train <- as.integer(
      gsub(pattern = "_.*", x = file_name, replacement = "")
    )

    contents <- read.csv(path, stringsAsFactors = FALSE)
    contents <- select(contents, year, value, variable, stitching_id, tol)
    mutate(
      contents,
      experiment = scenario,
      ensemble = n_train,
      tol_en = paste0(tol, " (", n_train, ")")
    )
  }
))
# Plot the ssp245 stitched series for 1900-2100, one panel per tolerance
# (rows) and `ensemble` value (columns).
data %>%
  filter(experiment == "ssp245") %>%
  filter(year %in% 1900:2100) %>%
  mutate(tol = as.character(tol)) %>%
  ggplot() +
  # `group` is the ggplot2 grouping aesthetic; the previous `groupby` was an
  # unknown aesthetic and was ignored, so lines were not drawn per stitching
  # id. Grouping on the id/tol pair keeps one line per realization even if
  # the same stitching id occurs under several tolerances.
  geom_line(
    aes(year, value, color = tol, group = interaction(stitching_id, tol)),
    alpha = 0.5
  ) +
  facet_grid(tol ~ ensemble) +
  theme_bw() +
  labs(
    title = "Ensemble size in vs. Tol",
    y = "Deg C"
  )

# Zoom in on the first years (1900-1905) of the ssp245 stitched series, one
# panel per `ensemble` value, coloured by tolerance.
data %>%
  filter(experiment == "ssp245") %>%
  filter(year %in% 1900:1905) %>%
  mutate(tol = as.character(tol)) %>%
  ggplot() +
  # `group` is the ggplot2 grouping aesthetic; the previous `groupby` was an
  # unknown aesthetic and was ignored. Panels here are not split by
  # tolerance, so the group combines stitching id and tol to avoid joining
  # series that belong to different tolerances into one line.
  geom_line(
    aes(year, value, color = tol, group = interaction(stitching_id, tol)),
    alpha = 0.5
  ) +
  facet_grid(~ensemble) +
  theme_bw() +
  labs(
    title = "Ensemble size in vs. Tol 1900 - 1905",
    y = "Deg C"
  )

3 Thoughts

  • Can’t decide if this is surprising or what we would expect
  • Does this depend on which ensemble members are randomly selected? If we were to do a Monte Carlo, would the n = sn observation hold true?
  • Does this also hold true for other models? Or is it a function of the tol, which we have determined to be model dependent?