Rsample Tutorial

rolling_origin

# rolling_origin() and tidy() are provided by rsample; ggplot() and friends
# by ggplot2. Attach them here so the examples run in a fresh session.
library(rsample)
library(ggplot2)

# Thirty consecutive days of toy data.
dat <- data.frame(day = 1:30)
# Resample by week instead of day: 7-row analysis and assessment sets,
# a non-cumulative window, and skip = 6 between kept resamples.
ts_cv <- rolling_origin(dat, initial = 7, assess = 7,
                        skip = 6, cumulative = FALSE)
# Long format (Row, Data, Resample) for plotting.
ts_cv <- tidy(ts_cv)
ggplot(ts_cv, aes(x = Resample, y = factor(Row), fill = Data)) +
  geom_tile() + scale_fill_brewer()

Usage: rolling_origin(data, initial = 5, assess = 1, cumulative = TRUE, skip = 0, lag = 0, ...)

# Reproducible 20-row example frame: a row counter plus standard-normal noise.
set.seed(1131)
ex_data <- data.frame(
  row = seq_len(20),
  some_var = rnorm(n = 20)
)
# All defaults: the 20-row frame yields 15 resamples.
dim(rolling_origin(data = ex_data))

## [1] 15  2

# skip = 2 thins the set down to 5 resamples.
dim(rolling_origin(data = ex_data, skip = 2))

## [1] 5 2

# Turning off the cumulative analysis set leaves the resample count at 5.
dim(rolling_origin(data = ex_data, cumulative = FALSE, skip = 2))

## [1] 5 2

# Default rolling origin on ex_data, converted to long format
# (Row, Data, Resample) in one step.
ts_cv <- tidy(rolling_origin(data = ex_data))

# Tile plot: resamples on the x axis, row numbers on the y axis,
# fill showing each row's role in the resample.
ggplot(
  data = ts_cv,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

# Same as the default example, but with skip = 2 to thin out the resamples.
ts_cv <- tidy(rolling_origin(data = ex_data, skip = 2))

# Tile plot: resamples on the x axis, row numbers on the y axis,
# fill showing each row's role in the resample.
ggplot(
  data = ts_cv,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

# skip = 2 combined with a non-cumulative analysis set.
ts_cv <- tidy(
  rolling_origin(data = ex_data, cumulative = FALSE, skip = 2)
)

# Tile plot: resamples on the x axis, row numbers on the y axis,
# fill showing each row's role in the resample.
ggplot(
  data = ts_cv,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

You can also roll over calendar periods by first nesting by that period, which is especially useful for irregular series where a fixed window is not appropriate. This example slides over 5 years at a time.

library(dplyr)
library(tidyr)
data(drinks, package = "modeldata")

# Derive the calendar year of each observation, then collapse every year's
# rows into a list-column called `data`, leaving one row per year.
# The nested columns are named explicitly: the unnamed form `nest(-year)`
# is deprecated in tidyr and emits an "All elements of `...` must be named"
# warning.
drinks_annual <- drinks %>%
  mutate(year = as.POSIXlt(date)$year + 1900) %>%
  nest(data = c(date, S4248SM144NCEN))

# Roll over the nested yearly rows with a non-cumulative analysis window.
multi_year_roll <- drinks_annual %>%
  rolling_origin(cumulative = FALSE)

# Analysis rows of the first split.
multi_year_roll[["splits"]][[1]] %>%
  analysis()

## # A tibble: 5 x 2
##    year data             
##   <dbl> <list>           
## 1  1992 <tibble [12 x 2]>
## 2  1993 <tibble [12 x 2]>
## 3  1994 <tibble [12 x 2]>
## 4  1995 <tibble [12 x 2]>
## 5  1996 <tibble [12 x 2]>

# Assessment rows of the same first split.
multi_year_roll[["splits"]][[1]] %>%
  assessment()

## # A tibble: 1 x 2
##    year data             
##   <dbl> <list>           
## 1  1997 <tibble [12 x 2]>

# Long format of the multi-year resamples, then the usual tile plot.
ts_cv <- multi_year_roll %>% tidy()
ggplot(
  data = ts_cv,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

library(vctrs)

## Warning: package 'vctrs' was built under R version 4.0.3

## 
## Attaching package: 'vctrs'

## The following object is masked from 'package:tibble':
## 
##     data_frame

## The following object is masked from 'package:dplyr':
## 
##     data_frame

library(tibble)
library(modeldata)
data("Chicago")

# Irregularly spaced dates built from day offsets (1 prints as 1970-01-02).
index <- new_date(
  x = c(1, 3, 4, 7, 8, 9, 13, 15, 16, 17)
)
# Ten-row tibble pairing a row counter with the irregular date index.
df <- tibble(
  x = seq_len(10),
  index = index
)
df

## # A tibble: 10 x 2
##        x index     
##    <int> <date>    
##  1     1 1970-01-02
##  2     2 1970-01-04
##  3     3 1970-01-05
##  4     4 1970-01-08
##  5     5 1970-01-09
##  6     6 1970-01-10
##  7     7 1970-01-14
##  8     8 1970-01-16
##  9     9 1970-01-17
## 10    10 1970-01-18

time resampling

Look back two rows beyond the current row, for a total of three rows in each analysis set. Each assessment set is composed of the two rows after the current row.

# Row-based sliding window: lookback = 2 and assess_stop = 2.
dfslide <- sliding_window(
  data = df,
  lookback = 2,
  assess_stop = 2
)
# Long format (Row, Data, Resample) for inspection and plotting.
dfslidetbl <- dfslide %>% tidy()
dfslidetbl

## # A tibble: 30 x 3
##      Row Data     Resample
##    <int> <chr>    <chr>   
##  1     1 Analysis Slice1  
##  2     2 Analysis Slice1  
##  3     2 Analysis Slice2  
##  4     3 Analysis Slice1  
##  5     3 Analysis Slice2  
##  6     3 Analysis Slice3  
##  7     4 Analysis Slice2  
##  8     4 Analysis Slice3  
##  9     4 Analysis Slice4  
## 10     5 Analysis Slice3  
## # ... with 20 more rows

# Tile plot: slices on the x axis, row numbers on the y axis,
# fill showing each row's role in the slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

Same as before, but step forward by 3 rows between each resampling slice, rather than just by 1.

# Same window sizes as before, advancing 3 rows between slices (step = 3).
dfslide <- sliding_window(
  data = df,
  lookback = 2,
  assess_stop = 2,
  step = 3
)
# Long format (Row, Data, Resample) for inspection and plotting.
dfslidetbl <- dfslide %>% tidy()
dfslidetbl

## # A tibble: 10 x 3
##      Row Data       Resample
##    <int> <chr>      <chr>   
##  1     1 Analysis   Slice1  
##  2     2 Analysis   Slice1  
##  3     3 Analysis   Slice1  
##  4     4 Analysis   Slice2  
##  5     5 Analysis   Slice2  
##  6     6 Analysis   Slice2  
##  7     4 Assessment Slice1  
##  8     5 Assessment Slice1  
##  9     7 Assessment Slice2  
## 10     8 Assessment Slice2

# Tile plot: slices on the x axis, row numbers on the y axis,
# fill showing each row's role in the slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

Now slide relative to the index column in df. This time we look back 2 days from the current row’s index value, and 2 days forward from it to construct the assessment set. Note that this series is irregular, so it produces different results than sliding_window(). Additionally, note that it is entirely possible for the assessment set to contain no data if you have a highly irregular series and “look forward” into a date range where no data points actually exist!

# Slide relative to the values of the `index` column rather than row
# positions, with lookback = 2 and assess_stop = 2.
dfslide <- sliding_index(
  data = df,
  index = index,
  lookback = 2,
  assess_stop = 2
)
# Long format (Row, Data, Resample) for inspection and plotting.
dfslidetbl <- dfslide %>% tidy()
dfslidetbl

## # A tibble: 20 x 3
##      Row Data       Resample
##    <int> <chr>      <chr>   
##  1     1 Analysis   Slice1  
##  2     2 Analysis   Slice1  
##  3     2 Analysis   Slice2  
##  4     3 Analysis   Slice2  
##  5     4 Analysis   Slice3  
##  6     4 Analysis   Slice4  
##  7     4 Analysis   Slice5  
##  8     5 Analysis   Slice4  
##  9     5 Analysis   Slice5  
## 10     6 Analysis   Slice5  
## 11     7 Analysis   Slice6  
## 12     7 Analysis   Slice7  
## 13     8 Analysis   Slice7  
## 14     3 Assessment Slice1  
## 15     5 Assessment Slice3  
## 16     6 Assessment Slice3  
## 17     6 Assessment Slice4  
## 18     8 Assessment Slice6  
## 19     9 Assessment Slice7  
## 20    10 Assessment Slice7

# Tile plot: slices on the x axis, row numbers on the y axis,
# fill showing each row's role in the slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

With sliding_period(), we can break up our date index into more granular chunks, and slide over them instead of the index directly. Here we’ll use the Chicago data, which contains daily data spanning 16 years, and we’ll break it up into rolling yearly chunks. Three years’ worth of data will be used for the analysis set, and one year’s worth of data will be held out for performance assessment.

# Group the Chicago `date` index by year and slide over those yearly
# groups, with lookback = 2 and assess_stop = 1.
dfslide <- sliding_period(
  data = Chicago,
  index = date,
  period = "year",
  lookback = 2,
  assess_stop = 1
)
# Long format (Row, Data, Resample) for inspection and plotting.
dfslidetbl <- dfslide %>% tidy()
dfslidetbl

## # A tibble: 18,847 x 3
##      Row Data     Resample
##    <int> <chr>    <chr>   
##  1     1 Analysis Slice01 
##  2     2 Analysis Slice01 
##  3     3 Analysis Slice01 
##  4     4 Analysis Slice01 
##  5     5 Analysis Slice01 
##  6     6 Analysis Slice01 
##  7     7 Analysis Slice01 
##  8     8 Analysis Slice01 
##  9     9 Analysis Slice01 
## 10    10 Analysis Slice01 
## # ... with 18,837 more rows

# Tile plot: slices on the x axis, row numbers on the y axis,
# fill showing each row's role in the slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

Because lookback = 2, three years are required to form a “complete” window of data. To allow partial windows, set complete = FALSE. Here that first constructs two expanding windows until a complete three year window can be formed, at which point we switch to a sliding window.

# Yearly sliding as before, but with complete = FALSE so that partial
# windows are allowed.
dfslide <- sliding_period(
  data = Chicago,
  index = date,
  period = "year",
  lookback = 2,
  assess_stop = 1,
  complete = FALSE
)
# Long format (Row, Data, Resample) for inspection and plotting.
dfslidetbl <- dfslide %>% tidy()
dfslidetbl

## # A tibble: 20,630 x 3
##      Row Data     Resample
##    <int> <chr>    <chr>   
##  1     1 Analysis Slice01 
##  2     1 Analysis Slice02 
##  3     1 Analysis Slice03 
##  4     2 Analysis Slice01 
##  5     2 Analysis Slice02 
##  6     2 Analysis Slice03 
##  7     3 Analysis Slice01 
##  8     3 Analysis Slice02 
##  9     3 Analysis Slice03 
## 10     4 Analysis Slice01 
## # ... with 20,620 more rows

# Tile plot: slices on the x axis, row numbers on the y axis,
# fill showing each row's role in the slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

Alternatively, you could break the resamples up by month. Here we’ll use an expanding monthly window by setting lookback = Inf, and each assessment set will contain two months of data. To ensure that we have enough data to fit our models, we’ll skip the first 4 expanding windows. Finally, to thin out the results, we’ll step forward by 2 between each resample.

# Monthly groups with an expanding analysis window (lookback = Inf),
# assess_stop = 2, the first 4 windows skipped (skip = 4) and a step of 2
# between resamples.
dfslide <- sliding_period(
  data = Chicago,
  index = date,
  period = "month",
  lookback = Inf,
  assess_stop = 2,
  skip = 4,
  step = 2
)
# Long format (Row, Data, Resample) for inspection and plotting.
dfslidetbl <- dfslide %>% tidy()
dfslidetbl

## # A tibble: 266,672 x 3
##      Row Data     Resample
##    <int> <chr>    <chr>   
##  1     1 Analysis Slice01 
##  2     1 Analysis Slice02 
##  3     1 Analysis Slice03 
##  4     1 Analysis Slice04 
##  5     1 Analysis Slice05 
##  6     1 Analysis Slice06 
##  7     1 Analysis Slice07 
##  8     1 Analysis Slice08 
##  9     1 Analysis Slice09 
## 10     1 Analysis Slice10 
## # ... with 266,662 more rows

# Tile plot: slices on the x axis, row numbers on the y axis,
# fill showing each row's role in the slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

rsample

Ginanjar Utama

27/12/2020

Rsample Tutorial

rolling_origin

time resampling