1 Set Up

The objective of this analysis is to look at what happens as n, the ensemble size, increases: how does sn, the synthetic ensemble size, change? We look at what happens when the tolerance is set at three different levels. For right now we have only looked at the results for CanESM5.

library(ggplot2)
library(dplyr)
library(knitr)

# Root directory of the experiment outputs; the chunks below read the
# "outofsample_largetol", "in_sample", and "unlimted_target_en" sub-folders.
# NOTE(review): absolute, machine-specific path -- adjust before running this
# analysis on another machine.
BASE_DIR <- "/Users/dorh012/projects/2021/cleanup_stitch/stitches/enriching_sample_size"

2 CanESM5

Looking at the tables of n vs. sn.

2.1 Tables of n vs sn

For n vs. sn for CanESM5 with tol 0.07, 0.1, and 0.13, we saw a 1-to-1 relationship regardless of the size of sn. The only time this wasn’t the case was when we had an obscene tolerance set to 0.5; then we saw a non-1-to-1 relationship between n and sn (n * 3.2 = sn), which I found somewhat puzzling…

2.1.1 Target experiments withheld from the training archive

# Read every synthetic-stitching CSV in the "outofsample_largetol" folder and
# stack them into one long data frame called `data`.
#
# Each file contributes the columns year, value, variable, stitching_id and
# tol, plus two columns derived from the file name:
#   experiment - "ssp245" if the file name contains that string, else "ssp370"
#   ensemble   - the integer before the first underscore of the file name
synthetic_dir <- file.path(BASE_DIR, "outofsample_largetol")
synthetic_files <- list.files(synthetic_dir, "synthetic", full.names = TRUE)

# Fail early with a clear message; with no files the error would otherwise
# only show up later as a missing-column error in the summary step.
if (length(synthetic_files) == 0) {
  stop("No synthetic files found in ", synthetic_dir, call. = FALSE)
}

data <- synthetic_files %>%
  lapply(function(path) {
    file_name <- basename(path)

    # `path` is a single file, so a scalar if/else is enough here.
    target_exp <- if (grepl(pattern = "ssp245", file_name)) "ssp245" else "ssp370"
    train_en <- as.integer(sub(pattern = "_.*", replacement = "", x = file_name))

    read.csv(path, stringsAsFactors = FALSE) %>%
      select(year, value, variable, stitching_id, tol) %>%
      mutate(experiment = target_exp, ensemble = train_en)
  }) %>%
  bind_rows()

# Tabulate sn, the number of distinct stitching_id values, for every
# combination of experiment, training ensemble size (n) and tolerance.
data %>%
  group_by(experiment, n = ensemble, tol) %>%
  # Drop the grouping explicitly: the printed table does not need it, and this
  # avoids the "grouped output" message from summarise().
  summarise(sn = n_distinct(stitching_id), .groups = "drop") %>%
  knitr::kable()

## `summarise()` has grouped output by 'experiment', 'n'. You can override using the `.groups` argument.

experiment	n	tol	sn
ssp245	5	0.07	5
ssp245	5	0.10	5
ssp245	5	0.13	5
ssp245	5	0.50	15
ssp245	10	0.07	10
ssp245	10	0.10	10
ssp245	10	0.13	10
ssp245	10	0.50	32
ssp245	15	0.07	15
ssp245	15	0.10	15
ssp245	15	0.13	15
ssp245	15	0.50	47
ssp245	25	0.07	25
ssp245	25	0.10	25
ssp245	25	0.13	25
ssp245	25	0.50	79
ssp370	5	0.07	5
ssp370	5	0.10	5
ssp370	5	0.13	5
ssp370	5	0.50	16
ssp370	10	0.07	9
ssp370	10	0.10	10
ssp370	10	0.13	10
ssp370	10	0.50	31
ssp370	15	0.07	15
ssp370	15	0.10	15
ssp370	15	0.13	15
ssp370	15	0.50	46
ssp370	25	0.07	26
ssp370	25	0.10	25
ssp370	25	0.13	25
ssp370	25	0.50	80

2.1.2 Target experiments included in the training archive

# Read every synthetic-stitching CSV in the "in_sample" folder and stack them
# into one long data frame called `data`.
#
# Each file contributes the columns year, value, variable, stitching_id and
# tol, plus two columns derived from the file name:
#   experiment - "ssp245" if the file name contains that string, else "ssp370"
#   ensemble   - the integer before the first underscore of the file name
synthetic_dir <- file.path(BASE_DIR, "in_sample")
synthetic_files <- list.files(synthetic_dir, "synthetic", full.names = TRUE)

# Fail early with a clear message; with no files the error would otherwise
# only show up later as a missing-column error in the summary step.
if (length(synthetic_files) == 0) {
  stop("No synthetic files found in ", synthetic_dir, call. = FALSE)
}

data <- synthetic_files %>%
  lapply(function(path) {
    file_name <- basename(path)

    # `path` is a single file, so a scalar if/else is enough here.
    target_exp <- if (grepl(pattern = "ssp245", file_name)) "ssp245" else "ssp370"
    train_en <- as.integer(sub(pattern = "_.*", replacement = "", x = file_name))

    read.csv(path, stringsAsFactors = FALSE) %>%
      select(year, value, variable, stitching_id, tol) %>%
      mutate(experiment = target_exp, ensemble = train_en)
  }) %>%
  bind_rows()

# Tabulate sn, the number of distinct stitching_id values, for every
# combination of experiment, training ensemble size (n) and tolerance.
data %>%
  group_by(experiment, n = ensemble, tol) %>%
  # Drop the grouping explicitly: the printed table does not need it, and this
  # avoids the "grouped output" message from summarise().
  summarise(sn = n_distinct(stitching_id), .groups = "drop") %>%
  knitr::kable()

## `summarise()` has grouped output by 'experiment', 'n'. You can override using the `.groups` argument.

experiment	n	tol	sn
ssp245	5	0.07	5
ssp245	5	0.10	5
ssp245	5	0.13	5
ssp245	10	0.07	10
ssp245	10	0.10	10
ssp245	10	0.13	10
ssp245	15	0.07	16
ssp245	15	0.10	15
ssp245	15	0.13	15
ssp245	25	0.07	25
ssp245	25	0.10	25
ssp245	25	0.13	25
ssp370	5	0.07	5
ssp370	5	0.10	5
ssp370	5	0.13	5
ssp370	10	0.07	10
ssp370	10	0.10	10
ssp370	10	0.13	10
ssp370	15	0.07	15
ssp370	15	0.10	15
ssp370	15	0.13	15
ssp370	25	0.07	25
ssp370	25	0.10	25
ssp370	25	0.13	25

2.1.3 Target all 25 ensemble members

# Read every synthetic-stitching CSV in the "unlimted_target_en" folder and
# stack them into one long data frame called `data`.
#
# Each file contributes the columns year, value, variable, stitching_id and
# tol, plus two columns derived from the file name:
#   experiment - "ssp245" if the file name contains that string, else "ssp370"
#   ensemble   - the integer before the first underscore of the file name
synthetic_dir <- file.path(BASE_DIR, "unlimted_target_en")
synthetic_files <- list.files(synthetic_dir, "synthetic", full.names = TRUE)

# Fail early with a clear message; with no files the error would otherwise
# only show up later as a missing-column error in the summary step.
if (length(synthetic_files) == 0) {
  stop("No synthetic files found in ", synthetic_dir, call. = FALSE)
}

data <- synthetic_files %>%
  lapply(function(path) {
    file_name <- basename(path)

    # `path` is a single file, so a scalar if/else is enough here.
    target_exp <- if (grepl(pattern = "ssp245", file_name)) "ssp245" else "ssp370"
    train_en <- as.integer(sub(pattern = "_.*", replacement = "", x = file_name))

    read.csv(path, stringsAsFactors = FALSE) %>%
      select(year, value, variable, stitching_id, tol) %>%
      mutate(experiment = target_exp, ensemble = train_en)
  }) %>%
  bind_rows()

# Tabulate sn, the number of distinct stitching_id values, for every
# combination of experiment, training ensemble size (n) and tolerance.
data %>%
  group_by(experiment, n = ensemble, tol) %>%
  # Drop the grouping explicitly: the printed table does not need it, and this
  # avoids the "grouped output" message from summarise().
  summarise(sn = n_distinct(stitching_id), .groups = "drop") %>%
  knitr::kable()

## `summarise()` has grouped output by 'experiment', 'n'. You can override using the `.groups` argument.

experiment	n	tol	sn
ssp245	5	0.07	5
ssp245	5	0.10	5
ssp245	5	0.13	5
ssp245	10	0.07	10
ssp245	10	0.10	10
ssp245	10	0.13	10
ssp245	15	0.07	15
ssp245	15	0.10	15
ssp245	15	0.13	15
ssp245	25	0.07	25
ssp245	25	0.10	25
ssp245	25	0.13	25
ssp370	5	0.07	5
ssp370	5	0.10	5
ssp370	5	0.13	5
ssp370	10	0.07	10
ssp370	10	0.10	10
ssp370	10	0.13	10
ssp370	15	0.07	15
ssp370	15	0.10	15
ssp370	15	0.13	15
ssp370	25	0.07	26
ssp370	25	0.10	25
ssp370	25	0.13	25

2.2 The Synthetic Stitched Results

# Read every synthetic-stitching CSV in the "in_sample" folder and stack them
# into one long data frame called `data`, ready for plotting.
#
# Each file contributes the columns year, value, variable, stitching_id and
# tol, plus three derived columns:
#   experiment - "ssp245" if the file name contains that string, else "ssp370"
#   ensemble   - the integer before the first underscore of the file name
#   tol_en     - label of the form "<tol> (<ensemble>)"
synthetic_dir <- file.path(BASE_DIR, "in_sample")
synthetic_files <- list.files(synthetic_dir, "synthetic", full.names = TRUE)

# Fail early with a clear message; with no files the error would otherwise
# only show up later as a missing-column error in the plotting step.
if (length(synthetic_files) == 0) {
  stop("No synthetic files found in ", synthetic_dir, call. = FALSE)
}

data <- synthetic_files %>%
  lapply(function(path) {
    file_name <- basename(path)

    # `path` is a single file, so a scalar if/else is enough here.
    target_exp <- if (grepl(pattern = "ssp245", file_name)) "ssp245" else "ssp370"
    train_en <- as.integer(sub(pattern = "_.*", replacement = "", x = file_name))

    read.csv(path, stringsAsFactors = FALSE) %>%
      select(year, value, variable, stitching_id, tol) %>%
      mutate(
        experiment = target_exp,
        ensemble = train_en,
        tol_en = paste0(tol, " (", train_en, ")")
      )
  }) %>%
  bind_rows()

# Plot the ssp245 stitched series for 1900-2100, one panel per tolerance (rows)
# and training ensemble size (columns).
data %>%
  filter(experiment == "ssp245") %>%
  filter(year %in% 1900:2100) %>%
  # Treat tol as a discrete label so each tolerance gets its own color.
  mutate(tol = as.character(tol)) %>%
  ggplot() +
  # The aesthetic is `group`, not `groupby` (which ggplot2 ignores with a
  # warning). Grouping by stitching_id draws one line per stitched series
  # instead of joining every series in a panel into a single path.
  geom_line(aes(year, value, color = tol, group = stitching_id), alpha = 0.5) +
  facet_grid(tol ~ ensemble) +
  theme_bw() +
  labs(
    title = "Ensemble size in vs. Tol",
    y = "Deg C"
  )

## Warning: Ignoring unknown aesthetics: groupby

# Zoom in on 1900-1905 of the ssp245 stitched series, one panel per training
# ensemble size, colored by tolerance.
data %>%
  filter(experiment == "ssp245") %>%
  filter(year %in% 1900:1905) %>%
  # Treat tol as a discrete label so each tolerance gets its own color.
  mutate(tol = as.character(tol)) %>%
  ggplot() +
  # The aesthetic is `group`, not `groupby` (which ggplot2 ignores with a
  # warning). Panels are split by ensemble size only, so group on stitching_id
  # and tol together: an explicit group overrides the default per-color
  # grouping, and this keeps lines from different tolerances separate in case
  # a stitching_id is reused across tolerances.
  geom_line(
    aes(year, value, color = tol, group = interaction(stitching_id, tol)),
    alpha = 0.5
  ) +
  facet_grid(~ensemble) +
  theme_bw() +
  labs(
    title = "Ensemble size in vs. Tol 1900 - 1905",
    y = "Deg C"
  )

## Warning: Ignoring unknown aesthetics: groupby

3 Thoughts

Can’t decide if this is surprising or what we would expect
Does this depend on the ensemble members randomly selected? If we were to do a Monte Carlo, would the n = sn observation hold true?
Does this also hold true for other models? Or is it a function of the tol, which we have determined to be model dependent?

Enriching the Sample Size Experiment

28 October, 2021