R Future

The purpose of the future package is to provide a very simple and uniform way of evaluating R expressions asynchronously using various resources available to the user (see the package web page).

library(future)
library(tictoc)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(furrr)
# Select the multisession backend for every future created below
# (both %<-% assignments and the furrr future_map*() calls).
plan(multisession)

This function just sleeps and then returns a message after waking up.

#' Pause for a while, then report that the wait is over
#'
#' @param time Number of seconds to pause, handed to `Sys.sleep()`.
#' @return The character string "Time over".
sleep_and_print <- function(time) {
  Sys.sleep(time)
  "Time over"
}

A normal assignment will require some waiting before we can do any other computation

# --- Regular assignment -------------------------------------------------
# tic()/toc() time the call; the recorded output shows the session was
# held for the full ~10 s before `a` became available.
tic()
a <- sleep_and_print(10)
toc()
## 10.015 sec elapsed
a
## [1] "Time over"
# --- Future assignment --------------------------------------------------
# Same call, but assigned with %<-%. The recorded timing (0.09 s) shows
# control coming back to the session almost immediately.
tic()
b %<-% sleep_and_print(10)
toc()
## 0.09 sec elapsed
# These statements do not touch `b`, so they run while the sleep is still
# in progress.
print("2 + 2 can be executed while sleeping")
## [1] "2 + 2 can be executed while sleeping"
2 + 2
## [1] 4
# Reading `b` needs the value of the future, so this is where the session
# waits for sleep_and_print() to finish (see the future package docs).
b
## [1] "Time over"
# Placed after the read of `b`, so it only runs once `b` is available.
print("This will have to wait")
## [1] "This will have to wait"

Purrr

purrr enhances R’s functional programming (FP) toolkit by providing a complete and consistent set of tools for working with functions and vectors. If you’ve never heard of FP before, the best place to start is the family of map() functions, which allow you to replace many for loops with code that is both more succinct and easier to read. The best place to learn about the map() functions is the iteration chapter in R for Data Science (see the package web page).

library(purrr)

What is the nest/map framework

# Toy data set: two labelled groups ("a" and "b") of five rows each,
# with values running from 1 to 10.
my_tibble <- tibble(
  label = rep(c("a", "b"), each = 5),
  value = 1:10
)
my_tibble
## # A tibble: 10 x 2
##    label value
##    <chr> <int>
##  1 a         1
##  2 a         2
##  3 a         3
##  4 a         4
##  5 a         5
##  6 b         6
##  7 b         7
##  8 b         8
##  9 b         9
## 10 b        10
# Collapse the `value` column into a list-column called `data`,
# leaving one row per label.
my_tibble_nested <- nest(my_tibble, data = value)
my_tibble_nested
## # A tibble: 2 x 2
##   label data            
##   <chr> <list>          
## 1 a     <tibble [5 × 1]>
## 2 b     <tibble [5 × 1]>
# Summarise: compute one mean per nested tibble
my_tibble_nested %>%
  mutate(
    average = map_dbl(data, function(group) mean(group$value))
  )
## # A tibble: 2 x 3
##   label data             average
##   <chr> <list>             <dbl>
## 1 a     <tibble [5 × 1]>       3
## 2 b     <tibble [5 × 1]>       8
# Summarise + filter: keep only the group(s) with the highest mean,
# then expand the surviving rows back out
my_tibble_nested %>%
  mutate(
    average = map_dbl(data, function(group) mean(group$value))
  ) %>%
  filter(average == max(average)) %>%
  unnest(data)
## # A tibble: 5 x 3
##   label value average
##   <chr> <int>   <dbl>
## 1 b         6       8
## 2 b         7       8
## 3 b         8       8
## 4 b         9       8
## 5 b        10       8
# Summarise + update: centre every group on its own mean
my_tibble_nested %>%
  mutate(
    average = map_dbl(data, function(group) mean(group$value))
  ) %>%
  mutate(
    # subtract the group's mean from its nested tibble
    data = map2(data, average, function(group, centre) group - centre)
  ) %>%
  unnest(data)
## # A tibble: 10 x 3
##    label value average
##    <chr> <dbl>   <dbl>
##  1 a        -2       3
##  2 a        -1       3
##  3 a         0       3
##  4 a         1       3
##  5 a         2       3
##  6 b        -2       8
##  7 b        -1       8
##  8 b         0       8
##  9 b         1       8
## 10 b         2       8

With furrr, you can easily parallelise computation over nested data frames

# Summarise + update, with the per-group work dispatched through furrr
my_tibble_nested %>%
  mutate(
    average = future_map_dbl(data, function(group) mean(group$value))
  ) %>%
  mutate(
    # subtract the group's mean from its nested tibble
    data = future_map2(data, average, function(group, centre) group - centre)
  ) %>%
  unnest(data)
## # A tibble: 10 x 3
##    label value average
##    <chr> <dbl>   <dbl>
##  1 a        -2       3
##  2 a        -1       3
##  3 a         0       3
##  4 a         1       3
##  5 a         2       3
##  6 b        -2       8
##  7 b        -1       8
##  8 b         0       8
##  9 b         1       8
## 10 b         2       8

The key, unfortunately, is to not use furrr when your session is heavily loaded (many GB of data loaded), or when each cell of the data column contains a huge amount of data. This is because the data transfer to the workers can heavily affect performance.

Parallelising over the HPC

library(future.batchtools)

# Number of CPU cores requested per job (passed on as a Slurm resource)
cores <- 8
# Placeholder: replace with the absolute path to your home directory
local_dir <- "ABSOLUTE_PATH_TO_MY_HOME_DIRECTORY"
#!/bin/bash

# Slurm job script template. The placeholders below are filled in from
# job.name, log.file and the "cores", "memory_mb" and "time" entries of
# the resources list.
# stdout and stderr are both directed to the same log file.
#SBATCH --job-name=<%= job.name %>
#SBATCH --output=<%= log.file %>
#SBATCH --error=<%= log.file %>
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=<%= resources[["cores"]] %>
#SBATCH --mem-per-cpu=<%= resources[["memory_mb"]] %>
#SBATCH --time=<%= resources[["time"]] %>

# An --array directive sized by nrow(jobs) is emitted only when array.jobs
# is set; otherwise this line renders as empty.
<%= if (array.jobs) sprintf("#SBATCH --array=1-%i", nrow(jobs)) else "" %>
# Run the job collection identified by uri through batchtools.
Rscript -e 'batchtools::doJobCollection("<%= uri %>")'
# Slurm-backed future strategy: batchtools_slurm with the job template
# file and the per-job resources (cores, memory in MB, wall time) that
# the template reads.
slurm <- future::tweak(
  batchtools_slurm,
  template = "slurm_batchtools.tmpl",
  resources = list(cores = cores, memory_mb = 5000, time = "48:00:00")
)

# From here on, futures are submitted through the Slurm strategy
plan(slurm)

# Summarise + update, now evaluated under the Slurm plan set above
my_tibble_nested %>%
  mutate(
    average = future_map_dbl(data, function(group) mean(group$value))
  ) %>%
  mutate(
    # subtract the group's mean from its nested tibble
    data = future_map2(data, average, function(group, centre) group - centre)
  ) %>%
  unnest(data)