#include "Rcpp.h"
// [[Rcpp::export]]
double normal_sum1(SEXP x) {
double res = 0;
double* loc = REAL(x);
double* end = loc + Rf_xlength(x);
while (loc != end) {
res += *loc++;
}
return res;
}
// [[Rcpp::export]]
double normal_sum2(SEXP x) {
R_xlen_t n = Rf_xlength(x);
double res = 0;
double* loc = REAL(x);
for (R_xlen_t i = 0; i < n; ++i) {
res += loc[i];
}
return res;
}
// [[Rcpp::export]]
double normal_sum3(SEXP x) {
R_xlen_t n = Rf_xlength(x);
double res = 0;
for (R_xlen_t i = 0; i < n; ++i) {
res += REAL(x)[i];
}
return res;
}
// [[Rcpp::export]]
SEXP normal_times_100(SEXP x) {
SEXP res = PROTECT(Rf_allocVector(REALSXP, Rf_xlength(x)));
double* res_p = REAL(res);
double* loc = REAL(x);
double* end = loc + Rf_xlength(x);
while (loc != end) {
*res_p++ = *loc++ * 100. + 1;
}
UNPROTECT(1);
return res;
}
// [[Rcpp::export]]
double altrep_sum(SEXP x) {
R_xlen_t n = Rf_xlength(x);
double res = 0;
for (R_xlen_t i = 0; i < n; ++i) {
res += REAL_ELT(x, i);
}
return res;
}
// [[Rcpp::export]]
SEXP altrep_times_100(SEXP x) {
R_xlen_t n = Rf_xlength(x);
SEXP res = PROTECT(Rf_allocVector(REALSXP, n));
double* res_p = REAL(res);
for (R_xlen_t i = 0; i < n; ++i) {
*res_p++ = REAL_ELT(x, i) * 100. + 1;
}
UNPROTECT(1);
return res;
}
// This uses C99 variable length arrays, but that is fine for testing
// [[Rcpp::export]]
double altrep_blocked(SEXP x, R_xlen_t BLOCK_SIZE = 64) {
R_xlen_t n = Rf_xlength(x);
double res = 0;
double buf[BLOCK_SIZE];
R_xlen_t i = 0;
while ((i + BLOCK_SIZE) < n) {
REAL_GET_REGION(x, i, BLOCK_SIZE, buf);
for (R_xlen_t j = 0; j < BLOCK_SIZE; ++j) {
res += buf[j];
}
i += BLOCK_SIZE;
}
R_xlen_t remaining = n - i;
// do the remainder
REAL_GET_REGION(x, i, n, buf);
for (R_xlen_t j = 0; j < remaining; ++j) {
res += buf[j];
}
return res;
}
class NormalNumericVector {
public:
NormalNumericVector(SEXP vec) : vec_(vec), start_(REAL(vec)) {}
double* begin() { return start_; }
double* end() { return start_ + size(); }
double operator*() { return *start_; }
double* operator++(int) {
++start_;
return start_;
}
R_xlen_t size() { return Rf_xlength(vec_); }
private:
SEXP vec_;
double* start_;
};
class AltrepNumericVector {
public:
AltrepNumericVector(SEXP vec) : vec_(vec), i_(0) {}
AltrepNumericVector begin() { return AltrepNumericVector(vec_); };
AltrepNumericVector end() {
AltrepNumericVector res(vec_);
res.i_ = size();
return res;
}
AltrepNumericVector& operator++() /* prefix */ {
++i_;
return *this;
}
bool operator!=(const AltrepNumericVector& other) { return i_ != other.i_; }
double operator*() { return REAL_ELT(vec_, i_); }
R_xlen_t size() { return Rf_xlength(vec_); }
private:
SEXP vec_;
R_xlen_t i_;
};
class AltrepNumericVectorBlock {
public:
AltrepNumericVectorBlock(SEXP vec, R_xlen_t block_size)
: vec_(vec),
i_(0),
j_(0),
size_(Rf_xlength(vec_)),
next_block_(std::min(block_size, size_)) {
buf.reserve(next_block_);
REAL_GET_REGION(vec_, i_, next_block_, buf.data());
}
AltrepNumericVectorBlock begin() {
return AltrepNumericVectorBlock(vec_, next_block_);
};
AltrepNumericVectorBlock end() {
AltrepNumericVectorBlock res(vec_, next_block_);
res.i_ = size();
return res;
}
AltrepNumericVectorBlock& operator++() /* prefix */ {
++j_;
if (j_ == next_block_) {
i_ += j_;
if ((i_ + next_block_) < size_) {
REAL_GET_REGION(vec_, i_, next_block_, buf.data());
} else {
next_block_ = size_ - i_;
if (next_block_ > 0) {
REAL_GET_REGION(vec_, i_, next_block_, buf.data());
}
}
j_ = 0;
}
return *this;
}
bool operator!=(const AltrepNumericVectorBlock& other) {
return !(i_ == other.i_ && j_ == other.j_);
}
double operator*() { return buf[j_]; }
R_xlen_t size() { return size_; }
private:
SEXP vec_;
R_xlen_t i_;
R_xlen_t j_;
R_xlen_t size_;
R_xlen_t next_block_;
std::vector<double> buf;
};
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
double normal_iterator1(SEXP x) {
auto v = NormalNumericVector(x);
double res = 0;
for (auto& x : v) {
res += x;
}
return res;
}
// [[Rcpp::export]]
double normal_iterator2(SEXP x) {
auto v = NormalNumericVector(x);
return std::accumulate(v.begin(), v.end(), 0.);
}
// [[Rcpp::export]]
double altrep_iterator1(SEXP x) {
auto v = AltrepNumericVector(x);
double res = 0;
for (auto&& x : v) {
res += x;
}
return res;
}
// [[Rcpp::export]]
double altrep_iterator2(SEXP x) {
auto v = AltrepNumericVector(x);
return std::accumulate(v.begin(), v.end(), 0.);
}
// [[Rcpp::export]]
double altrep_block_iterator1(SEXP x, R_xlen_t block_size = 64) {
auto v = AltrepNumericVectorBlock(x, block_size);
double res = 0;
for (auto&& x : v) {
res += x;
}
return res;
}
// [[Rcpp::export]]
double altrep_block_iterator2(SEXP x, R_xlen_t block_size = 64) {
auto v = AltrepNumericVectorBlock(x, block_size);
return std::accumulate(v.begin(), v.end(), 0.);
}
Altrep overhead is equivalent to calling REAL() on each iteration, however even that performance can be almost completely rescued by using the region interface and a small 64 byte buffer.
res <- bench::press(
n = 10 ^ (5:7),
{
set.seed(42)
x <- rnorm(n)
bench::mark(
normal_sum1(x),
normal_sum2(x),
normal_sum3(x),
altrep_sum(x),
altrep_blocked(x, 64),
normal_iterator1(x),
normal_iterator2(x),
altrep_iterator1(x),
altrep_iterator2(x),
altrep_block_iterator1(x),
altrep_block_iterator2(x)
)
}
)
to_keep <- c("expression", "n", "median", "itr/sec")
knitr::kable(res[to_keep])
| expression | n | median | itr/sec |
|---|---|---|---|
| normal_sum1(x) | 1e+05 | 88.69µs | 10645.91814 |
| normal_sum2(x) | 1e+05 | 88.73µs | 10924.97612 |
| normal_sum3(x) | 1e+05 | 252.61µs | 3854.38686 |
| altrep_sum(x) | 1e+05 | 256.06µs | 3811.60378 |
| altrep_blocked(x, 64) | 1e+05 | 88.44µs | 10896.82990 |
| normal_iterator1(x) | 1e+05 | 89.38µs | 10907.70075 |
| normal_iterator2(x) | 1e+05 | 88.59µs | 10970.03385 |
| altrep_iterator1(x) | 1e+05 | 260.25µs | 3738.49057 |
| altrep_iterator2(x) | 1e+05 | 261.76µs | 3712.33678 |
| altrep_block_iterator1(x) | 1e+05 | 214.12µs | 4534.40887 |
| altrep_block_iterator2(x) | 1e+05 | 110.84µs | 8785.61084 |
| normal_sum1(x) | 1e+06 | 863.53µs | 1148.67154 |
| normal_sum2(x) | 1e+06 | 869.49µs | 1136.20678 |
| normal_sum3(x) | 1e+06 | 2.55ms | 388.75374 |
| altrep_sum(x) | 1e+06 | 2.54ms | 391.10409 |
| altrep_blocked(x, 64) | 1e+06 | 873.3µs | 1129.33728 |
| normal_iterator1(x) | 1e+06 | 844.09µs | 1174.62185 |
| normal_iterator2(x) | 1e+06 | 846.24µs | 1163.75373 |
| altrep_iterator1(x) | 1e+06 | 2.54ms | 390.30531 |
| altrep_iterator2(x) | 1e+06 | 2.56ms | 386.61222 |
| altrep_block_iterator1(x) | 1e+06 | 2.16ms | 460.92093 |
| altrep_block_iterator2(x) | 1e+06 | 1.05ms | 943.12344 |
| normal_sum1(x) | 1e+07 | 8.57ms | 116.76921 |
| normal_sum2(x) | 1e+07 | 8.65ms | 115.33691 |
| normal_sum3(x) | 1e+07 | 25.65ms | 38.89224 |
| altrep_sum(x) | 1e+07 | 25.65ms | 39.17382 |
| altrep_blocked(x, 64) | 1e+07 | 8.77ms | 113.64278 |
| normal_iterator1(x) | 1e+07 | 8.56ms | 116.16466 |
| normal_iterator2(x) | 1e+07 | 8.53ms | 116.48583 |
| altrep_iterator1(x) | 1e+07 | 25.52ms | 39.14110 |
| altrep_iterator2(x) | 1e+07 | 25.48ms | 39.06390 |
| altrep_block_iterator1(x) | 1e+07 | 21.45ms | 46.55944 |
| altrep_block_iterator2(x) | 1e+07 | 10.72ms | 93.18537 |
plot(res)
The optimal buffersize can be explored as well (64 bytes seems optimal for this case).
res <- bench::press(
n = 10 ^ 7,
buf_size = 2 ^ (4:14),
{
set.seed(42)
x <- rnorm(n)
bench::mark(
altrep_block_iterator2(x, buf_size)
)
}
)
to_keep <- c("buf_size", "median", "itr/sec")
knitr::kable(res[to_keep])
| buf_size | median | itr/sec |
|---|---|---|
| 16 | 12.5ms | 69.13854 |
| 32 | 12.2ms | 81.55934 |
| 64 | 11ms | 90.48455 |
| 128 | 11.5ms | 85.40311 |
| 256 | 12.2ms | 81.39517 |
| 512 | 12.8ms | 77.47887 |
| 1024 | 12.9ms | 76.85149 |
| 2048 | 12.8ms | 77.76877 |
| 4096 | 13.3ms | 75.10114 |
| 8192 | 13.2ms | 74.63072 |
| 16384 | 13.9ms | 71.17870 |
Rcpp::sourceCpp("altrep.cc")