1 Goal

This tutorial was designed to show how to use the R functions to match based on the nearest neighbor and use these matches to stitch the synthetic data together into a ts. For now the target data is actual ESM data so that we can compare the stitched data to existing data to validate our emulation process.

2 Set Up

# required packages
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)

# The load the functions we will want to use, these are currently written in R and will be translated into python. 
source("stitches_dev/nearest_neighbor_matching.R") # the function match_nearest_neighbor is defined here
source("stitches_dev/stitching_functions.R")  # the function stitch_global_mean is definend here 

Load the inputs will be used in the tutorial.

# Time sereis of the raw global mean temperature anomoly, this is the data that is going to be stitched together.
tgav_data <- read.csv("stitches_dev/inputs/main_raw_pasted_tgav_anomaly_all_pangeo_list_models.csv", stringsAsFactors = FALSE)

# A chunked smoothed tgav anomaly archive archive of data. This was tgav_data but several steps were taken in python to 
# transform it into the "chunked" data and save it so that we do not have to repeate this process so many times. 
archive_data <- read.csv('stitches_dev/inputs/archive_data.csv', stringsAsFactors = FALSE)



# This is chunked smooth tgav anomaly for a single model/experiment/ensemble memeber, saved for our convience. 
# If you decide that you want to work with a different different target data subset you can subset it from the archive_data daata frame. 
target_data <- read.csv("stitches_dev/inputs/target_data.csv", stringsAsFactors = FALSE)

3 Matching

Use the function match_nearest_neighbor to match the target and archive data to one another. Right now the user will have to manually subset the the contents of the archive so that we are not matching to multiple models or on its self.

Now that we have the subset of the archive we want to work with, we can use it in the nearest neighbor.

archive_data %>% 
  dplyr::filter(model == unique(target_data$model)) %>% 
  dplyr::filter(experiment != unique(target_data$experiment)) -> 
  archive_subset

Match the target and archive data!

matched_data <- match_nearest_neighbor(target_data = target_data, archive_data = archive_subset)

It will return a rather large data frame, with 28 rows and 21 columns. It will contain information about the target and archive data matched together. Columns that contain information about the distance of the matched pairs are dist_dx, dist_fx, and dst_l2. dist_dx and dist_fx reflect the distance between the dx and fx components. dist_l2 contains the euclidean distance, this is the distance that we use to select the nearest neighbor.

head(matched_data)
matched_data %>%
  ggplot() + 
  geom_density(aes(dist_dx, fill = "dist_dx"), alpha = 0.5) + 
  geom_density(aes(dist_fx, fill = "dist_fx"), alpha = 0.5) + 
  geom_density(aes(dist_l2, fill = "dist_l2"), alpha = 0.5) + 
  theme_bw() + 
  labs(title = "Compare Distance", 
       y = "Density", x = "Distance Value")

Visualize the matched points.

ggplot(data = matched_data) + 
  # Add the subset of the data that was read into the matching process. 
  geom_point(data = archive_subset, aes(fx, dx, color = "no match"), alpha = 0.4) + 
  # Add the matched together points and make clear which points are matched with which. 
  geom_point(aes(archive_fx, archive_dx, color = "matched archive data")) + 
  geom_point(aes(target_fx, target_dx,  color = "target data"), alpha = 0.4) + 
  # Add lines between the matched values
  geom_segment(aes(x = target_fx, y = target_dx, xend = archive_fx, yend =  archive_dx), alpha = 0.4) +
 scale_color_manual(values = c("matched archive data" = "red", 
                               "target data" = "blue", "no match" = "grey"))+
  theme_bw() + 
  labs(y = "dx (rate of change per chunk, degC/year)", 
       x = "fx (value of median temperature per chunk, degC)", 
       title = "Matching betweenn target and archive" )

4 Stitching

Subset the comparison data.

tgav_data %>%  
  filter(model == unique(target_data$model) & experiment == unique(target_data$experiment) & 
           ensemble == unique(target_data$ensemble)) -> 
  original_data

Now that we have a data frame with matched information use the function stitch_global_mean to produce the stiched data.

out1 <- stitch_global_mean(match = matched_data, data = tgav_data)
## `summarise()` regrouping output by 'model', 'experiment', 'ensemble', 'timestep', 'grid_type' (override with `.groups` argument)
out1$scn <- "stitched 1"
ggplot(data = out1) + 
  geom_line(data = original_data, aes(year, value, color = "ESM data")) +
  geom_line(aes(year, value, color = "stitched data")) + 
  theme_bw() + 
  labs(x = "Year", y = "Degree C", title = "Comparison of ESM and stitched data")

5 Full Pipeline

If you want to do it all together in a single pipeline this is how it would be done.

rslts <- stitch_global_mean(match_nearest_neighbor(target_data, archive_subset), data = tgav_data)
## `summarise()` regrouping output by 'model', 'experiment', 'ensemble', 'timestep', 'grid_type' (override with `.groups` argument)
head(rslts)

Change the stiched outputs by changing the experiments included in the archive.

# Subset the archive data to exlude a single experiment at a time.
archive_subset %>% 
  filter(experiment != "ssp119") %>% 
  # Use that data to match with the target data. 
  match_nearest_neighbor(target_data = target_data, archive_data = .) %>% 
  # Use the matches to stitch the data together.
  stitch_global_mean(match = ., data = tgav_data) %>% 
  mutate(scn = "scn 1") -> 
  out1
## `summarise()` regrouping output by 'model', 'experiment', 'ensemble', 'timestep', 'grid_type' (override with `.groups` argument)
archive_subset %>% 
  filter(experiment != "ssp126") %>% 
  match_nearest_neighbor(target_data = target_data, archive_data = .) %>% 
  stitch_global_mean(match = ., data = tgav_data) %>% 
  mutate(scn = "scn 2") -> 
  out2
## `summarise()` regrouping output by 'model', 'experiment', 'ensemble', 'timestep', 'grid_type' (override with `.groups` argument)
archive_subset %>% 
  filter(experiment != "ssp370") %>% 
  match_nearest_neighbor(target_data = target_data, archive_data = .) %>% 
  stitch_global_mean(match = ., data = tgav_data) %>% 
  mutate(scn = "scn 3") -> 
  out3
## `summarise()` regrouping output by 'model', 'experiment', 'ensemble', 'timestep', 'grid_type' (override with `.groups` argument)
rslts <- rbind(out1, out2, out3)
ggplot() + 
  geom_line(data = original_data, aes(year, value), color = "grey") + 
  geom_line(data = rslts, aes(year, value, color = scn)) + 
  theme_bw() + 
  labs(title = "Comparison of ESM and Stitched Data\nCreated from different archives",
       x = "Year", y = "Deg C")