Set Up & Objective
The purpose of this notebook is to take a look at how increasing the size of the training ensemble & tolerence can effect the size of ensemble that we can stitch together.
library(assertthat)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
library(kableExtra)
DIR <- here::here("..", "stitches", "krd_exp")
theme_set(theme_bw())
list.files(DIR, "synthetic", full.names = TRUE) %>%
lapply(function(x){
mod <- strsplit(basename(x), split = "_")[[1]][2]
read.csv(x, stringsAsFactors = FALSE) %>%
select(-X) %>%
mutate(model = mod)
}) %>%
dplyr::bind_rows() ->
stitched_data
list.files(DIR, "compdata", full.names = TRUE) %>%
lapply(function(x){
out <- read.csv(x, stringsAsFactors = FALSE)
if(nrow(out)>1){
return(out)
}else{
return(data.frame("X" = NA,
"variable"=NA,
"experiment"=NA,
"ensemble"=NA,
"model"=NA,
"year" =NA,
"value" =NA,
"zstore" =NA ))
}
}) %>%
na.omit() %>%
bind_rows() %>%
select(variable, experiment, ensemble, model, year, value) %>%
distinct() ->
comp_data
Stitched Table
stitched_data %>%
group_by(tol, experiment, ensemble_size, model) %>%
filter(year == 1850) %>%
summarise(n = n_distinct(value)) %>%
ungroup() ->
stitched_ensemble_counts
stitched_ensemble_counts %>%
filter(ensemble_size != 1) %>%
spread(tol, n) %>%
arrange(experiment, ensemble_size) %>%
select(`Target Experiment` = experiment, `Training Ensemble Size` = ensemble_size, `Target Model` = model, `0.07`, `0.1`,
`0.13`) %>%
kable() %>%
kable_styling()
|
Target Experiment
|
Training Ensemble Size
|
Target Model
|
0.07
|
0.1
|
0.13
|
|
ssp245
|
4
|
ACCESS-ESM1-5
|
7
|
9
|
11
|
|
ssp245
|
4
|
CanESM5
|
4
|
4
|
4
|
|
ssp245
|
4
|
FGOALS-g3
|
6
|
8
|
8
|
|
ssp245
|
4
|
GISS-E2-1-G
|
4
|
4
|
5
|
|
ssp245
|
4
|
MIROC-ES2L
|
5
|
7
|
8
|
|
ssp245
|
4
|
MPI-ESM1-2-LR
|
4
|
5
|
8
|
|
ssp245
|
5
|
CanESM5
|
5
|
5
|
5
|
|
ssp245
|
10
|
ACCESS-ESM1-5
|
15
|
19
|
24
|
|
ssp245
|
10
|
CanESM5
|
10
|
10
|
10
|
|
ssp245
|
10
|
MIROC-ES2L
|
12
|
13
|
19
|
|
ssp245
|
10
|
MPI-ESM1-2-LR
|
12
|
15
|
20
|
|
ssp245
|
20
|
CanESM5
|
20
|
20
|
20
|
|
ssp245
|
25
|
CanESM5
|
25
|
25
|
25
|
|
ssp370
|
4
|
ACCESS-ESM1-5
|
4
|
5
|
6
|
|
ssp370
|
4
|
CanESM5
|
4
|
4
|
4
|
|
ssp370
|
4
|
FGOALS-g3
|
5
|
5
|
7
|
|
ssp370
|
4
|
GISS-E2-1-G
|
4
|
4
|
6
|
|
ssp370
|
4
|
MIROC-ES2L
|
2
|
2
|
2
|
|
ssp370
|
4
|
MPI-ESM1-2-LR
|
4
|
4
|
7
|
|
ssp370
|
5
|
CanESM5
|
5
|
5
|
5
|
|
ssp370
|
10
|
ACCESS-ESM1-5
|
11
|
13
|
14
|
|
ssp370
|
10
|
CanESM5
|
10
|
10
|
10
|
|
ssp370
|
10
|
MIROC-ES2L
|
6
|
7
|
8
|
|
ssp370
|
10
|
MPI-ESM1-2-LR
|
11
|
14
|
18
|
|
ssp370
|
20
|
CanESM5
|
21
|
20
|
20
|
|
ssp370
|
25
|
CanESM5
|
25
|
25
|
25
|
Training Ensemble vs. Stitched Ensemble
stitched_ensemble_counts %>%
filter(tol == "0.07") %>%
ggplot() +
geom_abline(intercept = 0, slope = 1, color = "grey") +
geom_point(aes(ensemble_size, n, color = model, shape = experiment), size = 3, alpha = 0.5) +
facet_wrap("tol", ncol = 1) +
theme_bw() +
labs(x = "Target Ensemble Size",
y = "Stitched Ensemble Size")

stitched_ensemble_counts %>%
filter(tol == "0.1") %>%
ggplot() +
geom_abline(intercept = 0, slope = 1, color = "grey") +
geom_point(aes(ensemble_size, n, color = model, shape = experiment), size = 3, alpha = 0.5) +
facet_wrap("tol", ncol = 1) +
theme_bw() +
labs(x = "Target Ensemble Size",
y = "Stitched Ensemble Size")

stitched_ensemble_counts %>%
filter(tol == "0.13") %>%
ggplot() +
geom_abline(intercept = 0, slope = 1, color = "grey") +
geom_point(aes(ensemble_size, n, color = model, shape = experiment), size = 3, alpha = 0.5) +
facet_wrap("tol", ncol = 1) +
theme_bw() +
labs(x = "Target Ensemble Size",
y = "Stitched Ensemble Size")

Stitched Results
# Plot stiched results vs tol for a single model & experiment
# Note this a sloppy function, relies on data that exists in the global environment
# Args
# model_name: a string of the model name to plot
# experiment_name: a string of the experiment name to plot
plot_stitched_data <- function(model_name, experiment_name){
stitched_data %>%
left_join(stitched_ensemble_counts, by = c("tol", "experiment", "ensemble_size", "model")) %>%
filter(ensemble_size %in% c(5, 10, 20, 25)) %>%
filter(model == model_name & experiment == experiment_name) %>%
mutate(ens_label = paste0("size out ", n)) ->
data_to_plot
data_to_plot %>%
select(tol, ensemble_size, ens_label) %>%
distinct() ->
dat_text
ggplot() +
geom_line(data = data_to_plot, aes(year, value, groupby = stitching_id), alpha = 0.4) +
geom_text(data = dat_text, aes(x = -Inf, y = 2, label = ens_label), hjust = -0.05, vjust = -0.1) +
facet_grid(tol~ensemble_size) +
labs(title = model_name,
subtitle = experiment_name,
y = "deg C")
}
SSP245
“ACCESS-ESM1-5”
plot_stitched_data("ACCESS-ESM1-5", "ssp245")

“CanESM5”
plot_stitched_data("CanESM5", "ssp245")

“MIROC-ES2L”
plot_stitched_data( "MIROC-ES2L", "ssp245")

“MPI-ESM1-2-LR”
plot_stitched_data("MPI-ESM1-2-LR", "ssp245")

ssp370
“ACCESS-ESM1-5”
plot_stitched_data("ACCESS-ESM1-5", "ssp370")

“CanESM5”
plot_stitched_data("CanESM5", "ssp370")

“MIROC-ES2L”
plot_stitched_data( "MIROC-ES2L", "ssp370")

“MPI-ESM1-2-LR”
plot_stitched_data("MPI-ESM1-2-LR", "ssp370")

Errors
# Calculate the jumps between each step in the data, it only works with a data frame that conntains
# data with a data from a single source.
#
# Args
# data: the data frame of values to aclcculate the difference in
get_jumps <- function(data){
# Check the inputs!
assert_that(has_name(x = data, which = c("year", "value", "ensemble", "model")))
assert_that(length(unique(data$year)) == length(data$year), msg = "mulitple entries for a single year")
assert_that(all(diff(data$year) == 1))
# Get the jumps
jumps <- diff(data$value)
# Store the data in a data frame.
data %>%
select(model, ensemble, experiment) %>%
distinct() %>%
cbind(jumps = jumps) ->
out
return(out)
}
stitched_data %>%
filter(ensemble_size %in% c(5, 10, 20, 25)) %>%
filter(model != "ACCESS-ESM1-5") %>%
mutate(ensemble = paste0(stitching_id, "_", tol, "_", ensemble_size)) %>%
select(year, value, ensemble, model, experiment) %>%
distinct() %>%
split(., interaction(.$model, .$experiment, .$ensemble, drop = TRUE)) %>%
lapply(get_jumps) %>%
bind_rows ->
stitched_jumps
comp_data %>%
filter(model %in% stitched_jumps$model & experiment %in% stitched_jumps$experiment) %>%
split(., interaction(.$model, .$ensemble, .$experiment), drop = TRUE) %>%
lapply(get_jumps) %>%
bind_rows() ->
comp_jumps
# make the jump density plot
plot_jump_density <- function(data, model_name){
data %>%
filter(model == model_name) %>%
ggplot() +
geom_density(aes(jumps, fill = source), alpha = 0.2) +
facet_wrap("experiment") +
labs(y = "Density of Jumps", x = NULL, title = model_name) ->
out
return(out)
}
stitched_jumps$source <- "stitched"
# lapply(stitched_jumps$ensemble, function(x){
# unlist(strsplit(x, split = "_"))[2]
# }) %>%
# unlist ->
# tol
#
# lapply(stitched_jumps$ensemble, function(x){
# unlist(strsplit(x, split = "_"))[3]
# }) %>%
# unlist ->
# en_size
#
# stitched_jumps$tol <- tol
# stitched_jumps$en_size <- en_size
#
# stitched_jumps %>%
# mutate(source = paste0(source, "_", tol, "_", en_size)) ->
# stitched_jumps
comp_jumps$source <- "original"
jump_data <- bind_rows(stitched_jumps, comp_jumps)
plot_jump_density(data = jump_data, model_name = "MPI-ESM1-2-LR")

plot_jump_density(data = jump_data, model_name = "CanESM5")

plot_jump_density(data = jump_data, model_name = "MIROC-ES2L")

jump_data %>%
group_by(model, source, experiment) %>%
summarise(mean = mean(jumps),
sd = sd(jumps)) %>%
gather(mean, sd, key = "variable", value = "value") %>%
spread(source, value) %>%
mutate(diff = abs(original - stitched)/ original) %>%
select(model, experiment, variable, diff) %>%
spread(variable, diff) %>%
knitr::kable(caption = "jumps; abs(original - stitched)/original") %>%
kable_styling()
jumps; abs(original - stitched)/original
|
model
|
experiment
|
mean
|
sd
|
|
CanESM5
|
ssp245
|
0.0060603
|
0.0988185
|
|
CanESM5
|
ssp370
|
0.0102871
|
0.1071776
|
|
MIROC-ES2L
|
ssp245
|
0.0273818
|
0.0440580
|
|
MIROC-ES2L
|
ssp370
|
0.0345149
|
0.0121302
|
|
MPI-ESM1-2-LR
|
ssp245
|
0.0100443
|
0.0872434
|
|
MPI-ESM1-2-LR
|
ssp370
|
0.0040811
|
0.0730655
|
stitched_data %>%
filter(ensemble_size %in% c(5, 10, 20, 25)) %>%
filter(model != "ACCESS-ESM1-5") %>%
group_by(model, experiment) %>%
summarise(mean = mean(value),
sd = sd(value)) %>%
ungroup %>%
gather(mean, sd, key = "stat", value = "value") %>%
mutate(source = "stitched") ->
stitched_mean
comp_data %>%
filter(model %in% stitched_mean$model & experiment %in% stitched_mean$experiment) %>%
group_by(experiment, model) %>%
summarise(mean = mean(value),
sd = sd(value)) %>%
ungroup %>%
gather(mean, sd, key = "stat", value = "value") %>%
mutate(source = "original") ->
original_mean
bind_rows(stitched_mean, original_mean) %>%
spread(source, value) %>%
mutate(diff = abs(original - stitched) / original) %>%
select(model, experiment, diff, stat) %>%
spread(stat, diff) %>%
knitr::kable(caption = "mean data; abs(original - stitched)/original") %>%
kable_styling()
mean data; abs(original - stitched)/original
|
model
|
experiment
|
mean
|
sd
|
|
CanESM5
|
ssp245
|
0.1811518
|
0.0002302
|
|
CanESM5
|
ssp370
|
0.0196102
|
0.0010959
|
|
MIROC-ES2L
|
ssp245
|
0.0636297
|
0.0019242
|
|
MIROC-ES2L
|
ssp370
|
0.0944577
|
0.0046249
|
|
MPI-ESM1-2-LR
|
ssp245
|
-0.0510506
|
0.0028091
|
|
MPI-ESM1-2-LR
|
ssp370
|
0.2129963
|
0.0007331
|
Problems
So it looks like ACCESS-ESM1-5 has gaps in the data! Why is that, should that be legal or should that be illegal? Or shoudl we throw an error.
experiment_name <- "ssp245"
model_name <- "ACCESS-ESM1-5"
stitched_data %>%
left_join(stitched_ensemble_counts, by = c("tol", "experiment", "ensemble_size", "model")) %>%
filter(ensemble_size %in% c(5, 10, 20, 25)) %>%
filter(model == "ACCESS-ESM1-5" & experiment == "ssp245") %>%
mutate(ens_label = paste0("size out ", n)) ->
data_to_plot
data_to_plot %>%
select(tol, ensemble_size, ens_label) %>%
distinct() ->
dat_text
ggplot() +
geom_line(data = data_to_plot, aes(year, value, groupby = stitching_id, color = stitching_id ), alpha = 0.4) +
geom_text(data = dat_text, aes(x = -Inf, y = 2, label = ens_label), hjust = -0.05, vjust = -0.1) +
facet_grid(tol~ensemble_size) +
labs(title = model_name,
subtitle = experiment_name,
y = "deg C") +
theme(legend.position = "none")

data_to_plot %>%
filter(stitching_id == "ssp245~r19i1p1f1~1") %>%
ggplot() +
geom_point(aes(year, value, groupby = stitching_id, color = stitching_id ), alpha = 0.4) +
geom_text(data = dat_text, aes(x = -Inf, y = 2, label = ens_label), hjust = -0.05, vjust = -0.1) +
facet_grid(tol~ensemble_size) +
labs(title = "ACCESS-ESM1-5",
subtitle = experiment_name,
y = "deg C") +
theme(legend.position = "none")
