This is an R port and extension of the repsample stata package. It can be found at https://github.com/ljlasker/RepsampleR/.
This enables greedy resampling, as in the original package, alongside a few other types of sampling, mostly for the added speed.
install.packages("remotes")
remotes::install_github("ljlasker/RepsampleR")
library(RepsampleR)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
set.seed(1)
baseline <- data.frame(x = rnorm(10000, 0, 1))
target_n <- 1000
# Same objective for both methods
obj_mean_sd <- function(out) {
x <- out$data$x[out$data$repsample == 1L]
(mean(x) - 1)^2 + (sd(x) - 1)^2
}
common <- list(
data = baseline,
size = target_n,
cont = "x",
mean = 1,
sd = 1,
dist = "normal",
n_seeds = 16,
seed_start = 1,
n_outer_workers = 1,
outer_parallel = "serial",
objective = obj_mean_sd,
backend = "cpu",
n_cores = 1
)
t_g <- system.time({
fit_g <- do.call(repsample_search, c(common, list(
method = "greedy",
quality = "fast",
randomperc = 40,
rrule = 200,
srule = 0.8
)))
})
# Importance: fastest path
t_i <- system.time({
fit_i <- do.call(repsample_search, c(common, list(
method = "importance"
)))
})
pick_g <- fit_g$best$data$x[fit_g$best$data$repsample == 1L]
pick_i <- fit_i$best$data$x[fit_i$best$data$repsample == 1L]
cat("GREEDY mean/sd/time:", mean(pick_g), sd(pick_g), t_g["elapsed"], "\n")
## GREEDY mean/sd/time: 0.8094763 1.041915 40.546
cat("FAST mean/sd/time:", mean(pick_i), sd(pick_i), t_i["elapsed"], "\n")
## FAST mean/sd/time: 0.8991203 0.9553544 0.061
cat("FAST method:", fit_i$best$meta$method, "\n")
## FAST method: importance_sampling
# ----------------------------
# Separate plot 1: Baseline vs Greedy picked
# ----------------------------
plot_df_greedy <- rbind(
data.frame(x = baseline$x, group = "Baseline"),
data.frame(x = pick_g, group = "Greedy Picked")
)
p_greedy <- ggplot(plot_df_greedy, aes(x = x, color = group, fill = group)) +
geom_density(alpha = 0.20, linewidth = 1.0) +
geom_vline(xintercept = 1, linetype = "dashed", color = "black") +
labs(
title = "Baseline vs Greedy Picked Distribution",
subtitle = sprintf("Greedy Picked: Mean = %.4f, SD = %.4f", mean(pick_g), sd(pick_g)),
x = "x",
y = NULL,
fill = NULL, color = NULL
) +
theme_minimal(base_size = 13)
print(p_greedy)
# ----------------------------
# Separate plot 2: Baseline vs Importance picked
# ----------------------------
plot_df_importance <- rbind(
data.frame(x = baseline$x, group = "Baseline"),
data.frame(x = pick_i, group = "Importance Picked")
)
p_importance <- ggplot(plot_df_importance, aes(x = x, color = group, fill = group)) +
geom_density(alpha = 0.20, linewidth = 1.0) +
geom_vline(xintercept = 1, linetype = "dashed", color = "black") +
labs(
title = "Baseline vs Importance Picked Distribution",
subtitle = sprintf("Importance Picked: Mean = %.4f, SD = %.4f", mean(pick_i), sd(pick_i)),
x = "x",
y = NULL,
fill = NULL, color = NULL
) +
theme_minimal(base_size = 13)
print(p_importance)
If you want more accuracy, you can be strict, stick to greedy resampling, drop the sample size (but beware the looming trade-off!), or increase the number of seeds. Because I want to go fast and I’m not using CUDA for this demonstration on my Mac, I’ll just use importance resampling with a lot of seeds.
target_mean <- 1
target_sd <- 1
# Strongly prioritize matching mean/SD
obj_mean_sd <- function(out) {
x <- out$data$x[out$data$repsample == 1L]
1000 * (mean(x) - target_mean)^2 + 500 * (sd(x) - target_sd)^2
}
set.seed(123)
baseline <- data.frame(x = rnorm(10000, 0, 1))
t <- system.time({
fit_imp <- repsample_search(
data = baseline,
size = 1000,
cont = "x",
mean = 1,
sd = 1,
dist = "normal",
method = "importance",
objective = obj_mean_sd,
n_seeds = 10000, # increase for tighter match
seed_start = 1,
n_outer_workers = 16,
outer_parallel = "serial",
backend = "cpu",
n_cores = 1
)
})
x_best <- fit_imp$best$data$x[fit_imp$best$data$repsample == 1L]
cat("mean/sd/time:", mean(x_best), sd(x_best), t["elapsed"], "\n")
## mean/sd/time: 0.960408 0.950056 40.194
cat("method:", fit_imp$best$meta$method, "\n")
## method: importance_sampling
picked <- fit_imp$best$data$x[fit_imp$best$data$repsample == 1L]
# 1) Baseline vs selected (density)
plot_df <- rbind(
data.frame(x = baseline$x, group = "Baseline"),
data.frame(x = picked, group = "Selected")
)
p1 <- ggplot(plot_df, aes(x = x, color = group, fill = group)) +
geom_density(alpha = 0.2, linewidth = 1) +
geom_vline(xintercept = 1, linetype = "dashed") +
labs(
title = "Baseline vs Importance-Sampled Selection",
subtitle = sprintf(
"Selected: n=%d, Mean=%.4f, SD=%.4f | Target: Mean=1, SD=1",
length(picked), mean(picked), sd(picked)
),
x = "x", y = NULL,
fill = NULL, color = NULL
) +
theme_minimal(base_size = 13)
print(p1)
# 2) Selected-only with target normal overlay
grid_x <- seq(min(picked), max(picked), length.out = 400)
target_df <- data.frame(x = grid_x, y = dnorm(grid_x, mean = 1, sd = 1))
p2 <- ggplot(data.frame(x = picked), aes(x = x)) +
geom_density(color = "#1f77b4", linewidth = 1.1) +
geom_line(data = target_df, aes(x = x, y = y), color = "#d62728", linewidth = 1, inherit.aes = FALSE) +
geom_vline(xintercept = 1, linetype = "dashed") +
labs(
title = "Selected Distribution vs Target N(1,1)",
subtitle = "Blue: Importance-Selected Density | Red: Target Density",
x = "x", y = NULL,
fill = NULL, color = NULL
) +
theme_minimal(base_size = 13)
print(p2)
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Chicago
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_4.0.1 RepsampleR_0.1.0
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.5 cli_3.6.5 knitr_1.46 rlang_1.1.7
## [5] xfun_0.52 highr_0.10 generics_0.1.3 S7_0.2.1
## [9] jsonlite_2.0.0 labeling_0.4.3 glue_1.8.0 htmltools_0.5.8.1
## [13] sass_0.4.9 fansi_1.0.6 scales_1.4.0 rmarkdown_2.29
## [17] grid_4.4.0 tibble_3.2.1 evaluate_1.0.5 jquerylib_0.1.4
## [21] fastmap_1.2.0 yaml_2.3.8 lifecycle_1.0.4 compiler_4.4.0
## [25] dplyr_1.1.4 RColorBrewer_1.1-3 pkgconfig_2.0.3 Rcpp_1.0.12
## [29] rstudioapi_0.16.0 farver_2.1.1 digest_0.6.35 R6_2.6.1
## [33] tidyselect_1.2.1 utf8_1.2.4 pillar_1.9.0 magrittr_2.0.3
## [37] bslib_0.7.0 withr_3.0.2 tools_4.4.0 gtable_0.3.6
## [41] cachem_1.1.0