# Attach the packages the examples below rely on: rsample supplies the
# resampling functions (rolling_origin() and the tidy() method used on its
# result) and ggplot2 supplies ggplot(), aes(), geom_tile() and
# scale_fill_brewer(). Re-attaching an already loaded package is harmless.
library(rsample)
library(ggplot2)

# Thirty consecutive days of toy data.
dat <- data.frame(day = 1:30)

# Resample by week instead of day: seven rows in each analysis set, seven rows
# in each assessment set, skipping six resamples between the ones that are
# kept, with a fixed-size (non-cumulative) analysis window.
ts_cv <- rolling_origin(
  dat,
  initial = 7,
  assess = 7,
  skip = 6,
  cumulative = FALSE
)

# One row per (Row, Data, Resample) combination, ready for plotting.
ts_cv <- tidy(ts_cv)

# Tile plot showing which rows land in which part of each resample.
ggplot(ts_cv, aes(x = Resample, y = factor(Row), fill = Data)) +
  geom_tile() +
  scale_fill_brewer()
# Usage: rolling_origin(data, initial = 5, assess = 1, cumulative = TRUE, skip = 0, lag = 0, ...)
# Fix the random seed so the simulated column is reproducible.
set.seed(1131)
ex_data <- data.frame(
  row = 1:20,
  some_var = rnorm(n = 20)
)

# Size of the resampling object with the default settings.
dim(rolling_origin(data = ex_data))
## [1] 15 2

# Thinning the resamples with skip = 2.
dim(rolling_origin(data = ex_data, skip = 2))
## [1] 5 2

# Same thinning, with a non-cumulative analysis window.
dim(rolling_origin(data = ex_data, skip = 2, cumulative = FALSE))
## [1] 5 2
# Default settings: build the resamples, tidy them, and plot them as tiles.
ts_cv <- tidy(rolling_origin(data = ex_data))
ggplot(
  data = ts_cv,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

# Same plot with skip = 2.
ts_cv <- tidy(rolling_origin(data = ex_data, skip = 2))
ggplot(
  data = ts_cv,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()

# Same plot with skip = 2 and a non-cumulative analysis window.
ts_cv <- tidy(rolling_origin(data = ex_data, skip = 2, cumulative = FALSE))
ggplot(
  data = ts_cv,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()
You can also roll over calendar periods by first nesting by that period, which is especially useful for irregular series where a fixed window is not useful. This example slides over 5 years at a time.
library(dplyr)
library(tidyr)
data(drinks, package = "modeldata")

# Derive the calendar year of every observation, then collapse the remaining
# columns into one list-column row per year.
# The list-column is named explicitly (`data = -year`): the unnamed form
# `nest(-year)` is deprecated and emits the warning
# "All elements of `...` must be named."
drinks_annual <- drinks %>%
  mutate(year = as.POSIXlt(date)$year + 1900) %>%
  nest(data = -year)
# Roll over the nested yearly rows with a non-cumulative analysis window.
multi_year_roll <- rolling_origin(data = drinks_annual, cumulative = FALSE)

# Analysis set of the first split.
analysis(multi_year_roll[["splits"]][[1]])
## # A tibble: 5 x 2
## year data
## <dbl> <list>
## 1 1992 <tibble [12 x 2]>
## 2 1993 <tibble [12 x 2]>
## 3 1994 <tibble [12 x 2]>
## 4 1995 <tibble [12 x 2]>
## 5 1996 <tibble [12 x 2]>

# Assessment set of the first split.
assessment(multi_year_roll[["splits"]][[1]])
## # A tibble: 1 x 2
## year data
## <dbl> <list>
## 1 1997 <tibble [12 x 2]>

# Tidy the resamples and plot them as tiles.
ts_cv <- tidy(multi_year_roll)
ggplot(
  data = ts_cv,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()
# Packages and data for the sliding_*() examples.
library(vctrs)
## Warning: package 'vctrs' was built under R version 4.0.3
##
## Attaching package: 'vctrs'
## The following object is masked from 'package:tibble':
##
## data_frame
## The following object is masked from 'package:dplyr':
##
## data_frame
library(tibble)
library(modeldata)
data("Chicago")

# An irregularly spaced date index, given as day offsets; the printed tibble
# below shows offset 1 as 1970-01-02.
index <- new_date(x = c(1, 3, 4, 7, 8, 9, 13, 15, 16, 17))

# Ten rows: a running integer counter alongside the irregular dates.
df <- tibble(
  x = seq_along(index),
  index = index
)
df
## # A tibble: 10 x 2
## x index
## <int> <date>
## 1 1 1970-01-02
## 2 2 1970-01-04
## 3 3 1970-01-05
## 4 4 1970-01-08
## 5 5 1970-01-09
## 6 6 1970-01-10
## 7 7 1970-01-14
## 8 8 1970-01-16
## 9 9 1970-01-17
## 10 10 1970-01-18
Look back two rows beyond the current row, for a total of three rows in each analysis set. Each assessment set is composed of the two rows after the current row.
# Row-based sliding window: look back two rows for the analysis set and take
# the two following rows as the assessment set.
dfslide <- sliding_window(
  data = df,
  lookback = 2,
  assess_stop = 2
)
dfslidetbl <- tidy(dfslide)
dfslidetbl
## # A tibble: 30 x 3
## Row Data Resample
## <int> <chr> <chr>
## 1 1 Analysis Slice1
## 2 2 Analysis Slice1
## 3 2 Analysis Slice2
## 4 3 Analysis Slice1
## 5 3 Analysis Slice2
## 6 3 Analysis Slice3
## 7 4 Analysis Slice2
## 8 4 Analysis Slice3
## 9 4 Analysis Slice4
## 10 5 Analysis Slice3
## # ... with 20 more rows

# Tile plot of row membership per slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()
Same as before, but step forward by 3 rows between each resampling slice, rather than just by 1.
# Same window as before, but advancing three rows between slices.
dfslide <- sliding_window(
  data = df,
  lookback = 2,
  assess_stop = 2,
  step = 3
)
dfslidetbl <- tidy(dfslide)
dfslidetbl
## # A tibble: 10 x 3
## Row Data Resample
## <int> <chr> <chr>
## 1 1 Analysis Slice1
## 2 2 Analysis Slice1
## 3 3 Analysis Slice1
## 4 4 Analysis Slice2
## 5 5 Analysis Slice2
## 6 6 Analysis Slice2
## 7 4 Assessment Slice1
## 8 5 Assessment Slice1
## 9 7 Assessment Slice2
## 10 8 Assessment Slice2

# Tile plot of row membership per slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()
Now slide relative to the index column in df. This time we look back 2 days from the current row’s index value, and 2 days forward from it to construct the assessment set. Note that this series is irregular, so it produces different results than sliding_window(). Additionally, note that it is entirely possible for the assessment set to contain no data if you have a highly irregular series and “look forward” into a date range where no data points actually exist!
# Slide relative to the values in the `index` column rather than row
# positions: two days back for analysis, two days forward for assessment.
dfslide <- sliding_index(
  data = df,
  index = index,
  lookback = 2,
  assess_stop = 2
)
dfslidetbl <- tidy(dfslide)
dfslidetbl
## # A tibble: 20 x 3
## Row Data Resample
## <int> <chr> <chr>
## 1 1 Analysis Slice1
## 2 2 Analysis Slice1
## 3 2 Analysis Slice2
## 4 3 Analysis Slice2
## 5 4 Analysis Slice3
## 6 4 Analysis Slice4
## 7 4 Analysis Slice5
## 8 5 Analysis Slice4
## 9 5 Analysis Slice5
## 10 6 Analysis Slice5
## 11 7 Analysis Slice6
## 12 7 Analysis Slice7
## 13 8 Analysis Slice7
## 14 3 Assessment Slice1
## 15 5 Assessment Slice3
## 16 6 Assessment Slice3
## 17 6 Assessment Slice4
## 18 8 Assessment Slice6
## 19 9 Assessment Slice7
## 20 10 Assessment Slice7

# Tile plot of row membership per slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()
With sliding_period(), we can break up our date index into more granular chunks, and slide over them instead of the index directly. Here we’ll use the Chicago data, which contains daily data spanning 16 years, and we’ll break it up into rolling yearly chunks. Three years’ worth of data will be used for the analysis set, and one year’s worth of data will be held out for performance assessment.
# Slide over the Chicago data in yearly chunks of its `date` column: the
# current year plus the two before it for analysis, the next year for
# assessment.
dfslide <- sliding_period(
  data = Chicago,
  index = date,
  period = "year",
  lookback = 2,
  assess_stop = 1
)
dfslidetbl <- tidy(dfslide)
dfslidetbl
## # A tibble: 18,847 x 3
## Row Data Resample
## <int> <chr> <chr>
## 1 1 Analysis Slice01
## 2 2 Analysis Slice01
## 3 3 Analysis Slice01
## 4 4 Analysis Slice01
## 5 5 Analysis Slice01
## 6 6 Analysis Slice01
## 7 7 Analysis Slice01
## 8 8 Analysis Slice01
## 9 9 Analysis Slice01
## 10 10 Analysis Slice01
## # ... with 18,837 more rows

# Tile plot of row membership per slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()
Because lookback = 2, three years are required to form a “complete” window of data. To allow partial windows, set complete = FALSE. Here that first constructs two expanding windows until a complete three-year window can be formed, at which point we switch to a sliding window.
# Same yearly slide, but with complete = FALSE so that partial windows at the
# start of the series are allowed.
dfslide <- sliding_period(
  data = Chicago,
  index = date,
  period = "year",
  lookback = 2,
  assess_stop = 1,
  complete = FALSE
)
dfslidetbl <- tidy(dfslide)
dfslidetbl
## # A tibble: 20,630 x 3
## Row Data Resample
## <int> <chr> <chr>
## 1 1 Analysis Slice01
## 2 1 Analysis Slice02
## 3 1 Analysis Slice03
## 4 2 Analysis Slice01
## 5 2 Analysis Slice02
## 6 2 Analysis Slice03
## 7 3 Analysis Slice01
## 8 3 Analysis Slice02
## 9 3 Analysis Slice03
## 10 4 Analysis Slice01
## # ... with 20,620 more rows

# Tile plot of row membership per slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()
Alternatively, you could break the resamples up by month. Here we’ll use an expanding monthly window by setting lookback = Inf, and each assessment set will contain two months of data. To ensure that we have enough data to fit our models, we’ll skip the first 4 expanding windows. Finally, to thin out the results, we’ll step forward by 2 between each resample.
# Expanding monthly window (lookback = Inf) with two months per assessment
# set; the first four windows are skipped and the slide advances by two
# between resamples.
dfslide <- sliding_period(
  data = Chicago,
  index = date,
  period = "month",
  lookback = Inf,
  assess_stop = 2,
  skip = 4,
  step = 2
)
dfslidetbl <- tidy(dfslide)
dfslidetbl
## # A tibble: 266,672 x 3
## Row Data Resample
## <int> <chr> <chr>
## 1 1 Analysis Slice01
## 2 1 Analysis Slice02
## 3 1 Analysis Slice03
## 4 1 Analysis Slice04
## 5 1 Analysis Slice05
## 6 1 Analysis Slice06
## 7 1 Analysis Slice07
## 8 1 Analysis Slice08
## 9 1 Analysis Slice09
## 10 1 Analysis Slice10
## # ... with 266,662 more rows

# Tile plot of row membership per slice.
ggplot(
  data = dfslidetbl,
  mapping = aes(x = Resample, y = factor(Row), fill = Data)
) +
  geom_tile() +
  scale_fill_brewer()