Background

This is an R port and extension of the repsample stata package. It can be found at https://github.com/ljlasker/RepsampleR/.

This enables greedy resampling, as in the original package, alongside a few other types of sampling, mostly for the added speed.

Install

install.packages("remotes")
remotes::install_github("ljlasker/RepsampleR")

Usage

library(RepsampleR)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
set.seed(1)
baseline <- data.frame(x = rnorm(10000, 0, 1))
target_n <- 1000

# Same objective for both methods
obj_mean_sd <- function(out) {
  x <- out$data$x[out$data$repsample == 1L]
  (mean(x) - 1)^2 + (sd(x) - 1)^2
}

common <- list(
  data = baseline,
  size = target_n,
  cont = "x",
  mean = 1,
  sd = 1,
  dist = "normal",
  n_seeds = 16,            
  seed_start = 1,
  n_outer_workers = 1,
  outer_parallel = "serial",
  objective = obj_mean_sd,
  backend = "cpu",
  n_cores = 1
)

t_g <- system.time({
  fit_g <- do.call(repsample_search, c(common, list(
    method = "greedy",
    quality = "fast",     
    randomperc = 40,
    rrule = 200,
    srule = 0.8
  )))
})

# Importance: fastest path
t_i <- system.time({
  fit_i <- do.call(repsample_search, c(common, list(
    method = "importance"
  )))
})

pick_g <- fit_g$best$data$x[fit_g$best$data$repsample == 1L]
pick_i <- fit_i$best$data$x[fit_i$best$data$repsample == 1L]

cat("GREEDY mean/sd/time:", mean(pick_g), sd(pick_g), t_g["elapsed"], "\n")
## GREEDY mean/sd/time: 0.8094763 1.041915 40.546
cat("FAST   mean/sd/time:", mean(pick_i), sd(pick_i), t_i["elapsed"], "\n")
## FAST   mean/sd/time: 0.8991203 0.9553544 0.061
cat("FAST method:", fit_i$best$meta$method, "\n")
## FAST method: importance_sampling
# ----------------------------
# Separate plot 1: Baseline vs Greedy picked
# ----------------------------
plot_df_greedy <- rbind(
  data.frame(x = baseline$x, group = "Baseline"),
  data.frame(x = pick_g, group = "Greedy Picked")
)

p_greedy <- ggplot(plot_df_greedy, aes(x = x, color = group, fill = group)) +
  geom_density(alpha = 0.20, linewidth = 1.0) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "black") +
  labs(
    title = "Baseline vs Greedy Picked Distribution",
    subtitle = sprintf("Greedy Picked: Mean = %.4f, SD = %.4f", mean(pick_g), sd(pick_g)),
    x = "x",
    y = NULL,
    fill = NULL, color = NULL
  ) +
  theme_minimal(base_size = 13)

print(p_greedy)

# ----------------------------
# Separate plot 2: Baseline vs Importance picked
# ----------------------------
plot_df_importance <- rbind(
  data.frame(x = baseline$x, group = "Baseline"),
  data.frame(x = pick_i, group = "Importance Picked")
)

p_importance <- ggplot(plot_df_importance, aes(x = x, color = group, fill = group)) +
  geom_density(alpha = 0.20, linewidth = 1.0) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "black") +
  labs(
    title = "Baseline vs Importance Picked Distribution",
    subtitle = sprintf("Importance Picked: Mean = %.4f, SD = %.4f", mean(pick_i), sd(pick_i)),
    x = "x",
    y = NULL,
    fill = NULL, color = NULL
  ) +
  theme_minimal(base_size = 13)

print(p_importance)

If you want more accuracy, you can be strict, stick to greedy resampling, drop the sample size (but beware the looming trade-off!), or increase the number of seeds. Because I want to go fast and I’m not using CUDA for this demonstration on my Mac, I’ll just use importance resampling with a lot of seeds.

target_mean <- 1
target_sd   <- 1

# Strongly prioritize matching mean/SD
obj_mean_sd <- function(out) {
  x <- out$data$x[out$data$repsample == 1L]
  1000 * (mean(x) - target_mean)^2 + 500 * (sd(x) - target_sd)^2
}

set.seed(123)
baseline <- data.frame(x = rnorm(10000, 0, 1))

t <- system.time({
  fit_imp <- repsample_search(
    data = baseline,
    size = 1000,
    cont = "x",
    mean = 1,
    sd = 1,
    dist = "normal",
    method = "importance",
    objective = obj_mean_sd,
    n_seeds = 10000,          # increase for tighter match
    seed_start = 1,
    n_outer_workers = 16,
    outer_parallel = "serial",
    backend = "cpu",
    n_cores = 1
  )
})

x_best <- fit_imp$best$data$x[fit_imp$best$data$repsample == 1L]
cat("mean/sd/time:", mean(x_best), sd(x_best), t["elapsed"], "\n")
## mean/sd/time: 0.960408 0.950056 40.194
cat("method:", fit_imp$best$meta$method, "\n")
## method: importance_sampling
picked <- fit_imp$best$data$x[fit_imp$best$data$repsample == 1L]

# 1) Baseline vs selected (density)
plot_df <- rbind(
  data.frame(x = baseline$x, group = "Baseline"),
  data.frame(x = picked, group = "Selected")
)

p1 <- ggplot(plot_df, aes(x = x, color = group, fill = group)) +
  geom_density(alpha = 0.2, linewidth = 1) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  labs(
    title = "Baseline vs Importance-Sampled Selection",
    subtitle = sprintf(
      "Selected: n=%d, Mean=%.4f, SD=%.4f | Target: Mean=1, SD=1",
      length(picked), mean(picked), sd(picked)
    ),
    x = "x", y = NULL,
    fill = NULL, color = NULL
  ) +
  theme_minimal(base_size = 13)

print(p1)

# 2) Selected-only with target normal overlay
grid_x <- seq(min(picked), max(picked), length.out = 400)
target_df <- data.frame(x = grid_x, y = dnorm(grid_x, mean = 1, sd = 1))

p2 <- ggplot(data.frame(x = picked), aes(x = x)) +
  geom_density(color = "#1f77b4", linewidth = 1.1) +
  geom_line(data = target_df, aes(x = x, y = y), color = "#d62728", linewidth = 1, inherit.aes = FALSE) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  labs(
    title = "Selected Distribution vs Target N(1,1)",
    subtitle = "Blue: Importance-Selected Density | Red: Target Density",
    x = "x", y = NULL,
    fill = NULL, color = NULL
  ) +
  theme_minimal(base_size = 13)

print(p2)

Session Info

sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_4.0.1    RepsampleR_0.1.0
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        cli_3.6.5          knitr_1.46         rlang_1.1.7       
##  [5] xfun_0.52          highr_0.10         generics_0.1.3     S7_0.2.1          
##  [9] jsonlite_2.0.0     labeling_0.4.3     glue_1.8.0         htmltools_0.5.8.1 
## [13] sass_0.4.9         fansi_1.0.6        scales_1.4.0       rmarkdown_2.29    
## [17] grid_4.4.0         tibble_3.2.1       evaluate_1.0.5     jquerylib_0.1.4   
## [21] fastmap_1.2.0      yaml_2.3.8         lifecycle_1.0.4    compiler_4.4.0    
## [25] dplyr_1.1.4        RColorBrewer_1.1-3 pkgconfig_2.0.3    Rcpp_1.0.12       
## [29] rstudioapi_0.16.0  farver_2.1.1       digest_0.6.35      R6_2.6.1          
## [33] tidyselect_1.2.1   utf8_1.2.4         pillar_1.9.0       magrittr_2.0.3    
## [37] bslib_0.7.0        withr_3.0.2        tools_4.4.0        gtable_0.3.6      
## [41] cachem_1.1.0