#include "Rcpp.h"

// [[Rcpp::export]]
double normal_sum1(SEXP x) {
  double res = 0;
  double* loc = REAL(x);
  double* end = loc + Rf_xlength(x);

  while (loc != end) {
    res += *loc++;
  }
  return res;
}

// [[Rcpp::export]]
double normal_sum2(SEXP x) {
  R_xlen_t n = Rf_xlength(x);
  double res = 0;
  double* loc = REAL(x);
  for (R_xlen_t i = 0; i < n; ++i) {
    res += loc[i];
  }
  return res;
}

// [[Rcpp::export]]
double normal_sum3(SEXP x) {
  R_xlen_t n = Rf_xlength(x);
  double res = 0;
  for (R_xlen_t i = 0; i < n; ++i) {
    res += REAL(x)[i];
  }
  return res;
}

// [[Rcpp::export]]
SEXP normal_times_100(SEXP x) {
  SEXP res = PROTECT(Rf_allocVector(REALSXP, Rf_xlength(x)));
  double* res_p = REAL(res);
  double* loc = REAL(x);
  double* end = loc + Rf_xlength(x);

  while (loc != end) {
    *res_p++ = *loc++ * 100. + 1;
  }
  UNPROTECT(1);
  return res;
}

// [[Rcpp::export]]
double altrep_sum(SEXP x) {
  R_xlen_t n = Rf_xlength(x);
  double res = 0;
  for (R_xlen_t i = 0; i < n; ++i) {
    res += REAL_ELT(x, i);
  }
  return res;
}

// [[Rcpp::export]]
SEXP altrep_times_100(SEXP x) {
  R_xlen_t n = Rf_xlength(x);
  SEXP res = PROTECT(Rf_allocVector(REALSXP, n));
  double* res_p = REAL(res);
  for (R_xlen_t i = 0; i < n; ++i) {
    *res_p++ = REAL_ELT(x, i) * 100. + 1;
  }
  UNPROTECT(1);
  return res;
}

// This uses C99 variable length arrays, but that is fine for testing
// [[Rcpp::export]]
double altrep_blocked(SEXP x, R_xlen_t BLOCK_SIZE = 64) {
  R_xlen_t n = Rf_xlength(x);
  double res = 0;
  double buf[BLOCK_SIZE];

  R_xlen_t i = 0;
  while ((i + BLOCK_SIZE) < n) {
    REAL_GET_REGION(x, i, BLOCK_SIZE, buf);
    for (R_xlen_t j = 0; j < BLOCK_SIZE; ++j) {
      res += buf[j];
    }
    i += BLOCK_SIZE;
  }
  R_xlen_t remaining = n - i;

  // do the remainder
  REAL_GET_REGION(x, i, n, buf);

  for (R_xlen_t j = 0; j < remaining; ++j) {
    res += buf[j];
  }

  return res;
}

class NormalNumericVector {
public:
  NormalNumericVector(SEXP vec) : vec_(vec), start_(REAL(vec)) {}
  double* begin() { return start_; }
  double* end() { return start_ + size(); }
  double operator*() { return *start_; }
  double* operator++(int) {
    ++start_;
    return start_;
  }
  R_xlen_t size() { return Rf_xlength(vec_); }

private:
  SEXP vec_;
  double* start_;
};

class AltrepNumericVector {
public:
  AltrepNumericVector(SEXP vec) : vec_(vec), i_(0) {}

  AltrepNumericVector begin() { return AltrepNumericVector(vec_); };
  AltrepNumericVector end() {
    AltrepNumericVector res(vec_);
    res.i_ = size();
    return res;
  }
  AltrepNumericVector& operator++() /* prefix */ {
    ++i_;
    return *this;
  }
  bool operator!=(const AltrepNumericVector& other) { return i_ != other.i_; }
  double operator*() { return REAL_ELT(vec_, i_); }
  R_xlen_t size() { return Rf_xlength(vec_); }

private:
  SEXP vec_;
  R_xlen_t i_;
};

class AltrepNumericVectorBlock {
public:
  AltrepNumericVectorBlock(SEXP vec, R_xlen_t block_size)
      : vec_(vec),
        i_(0),
        j_(0),
        size_(Rf_xlength(vec_)),
        next_block_(std::min(block_size, size_)) {
    buf.reserve(next_block_);
    REAL_GET_REGION(vec_, i_, next_block_, buf.data());
  }

  AltrepNumericVectorBlock begin() {
    return AltrepNumericVectorBlock(vec_, next_block_);
  };
  AltrepNumericVectorBlock end() {
    AltrepNumericVectorBlock res(vec_, next_block_);
    res.i_ = size();
    return res;
  }
  AltrepNumericVectorBlock& operator++() /* prefix */ {
    ++j_;
    if (j_ == next_block_) {
      i_ += j_;
      if ((i_ + next_block_) < size_) {
        REAL_GET_REGION(vec_, i_, next_block_, buf.data());
      } else {
        next_block_ = size_ - i_;
        if (next_block_ > 0) {
          REAL_GET_REGION(vec_, i_, next_block_, buf.data());
        }
      }
      j_ = 0;
    }
    return *this;
  }

  bool operator!=(const AltrepNumericVectorBlock& other) {
    return !(i_ == other.i_ && j_ == other.j_);
  }

  double operator*() { return buf[j_]; }
  R_xlen_t size() { return size_; }

private:
  SEXP vec_;
  R_xlen_t i_;
  R_xlen_t j_;
  R_xlen_t size_;
  R_xlen_t next_block_;
  std::vector<double> buf;
};

// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
double normal_iterator1(SEXP x) {
  auto v = NormalNumericVector(x);

  double res = 0;
  for (auto& x : v) {
    res += x;
  }
  return res;
}

// [[Rcpp::export]]
double normal_iterator2(SEXP x) {
  auto v = NormalNumericVector(x);

  return std::accumulate(v.begin(), v.end(), 0.);
}

// [[Rcpp::export]]
double altrep_iterator1(SEXP x) {
  auto v = AltrepNumericVector(x);

  double res = 0;
  for (auto&& x : v) {
    res += x;
  }
  return res;
}

// [[Rcpp::export]]
double altrep_iterator2(SEXP x) {
  auto v = AltrepNumericVector(x);

  return std::accumulate(v.begin(), v.end(), 0.);
}

// [[Rcpp::export]]
double altrep_block_iterator1(SEXP x, R_xlen_t block_size = 64) {
  auto v = AltrepNumericVectorBlock(x, block_size);

  double res = 0;
  for (auto&& x : v) {
    res += x;
  }
  return res;
}

// [[Rcpp::export]]
double altrep_block_iterator2(SEXP x, R_xlen_t block_size = 64) {
  auto v = AltrepNumericVectorBlock(x, block_size);

  return std::accumulate(v.begin(), v.end(), 0.);
}

Altrep overhead is equivalent to calling REAL() on each iteration, however even that performance can be almost completely rescued by using the region interface and a small 64 byte buffer.

res <- bench::press(
  n = 10 ^ (5:7),
  {
    set.seed(42)
    x <- rnorm(n)
    bench::mark(
      normal_sum1(x),
      normal_sum2(x),
      normal_sum3(x),
      altrep_sum(x),
      altrep_blocked(x, 64),
      normal_iterator1(x),
      normal_iterator2(x),
      altrep_iterator1(x),
      altrep_iterator2(x),
      altrep_block_iterator1(x),
      altrep_block_iterator2(x)
    )
  }
)

to_keep <- c("expression", "n", "median", "itr/sec")
knitr::kable(res[to_keep])
expression n median itr/sec
normal_sum1(x) 1e+05 88.69µs 10645.91814
normal_sum2(x) 1e+05 88.73µs 10924.97612
normal_sum3(x) 1e+05 252.61µs 3854.38686
altrep_sum(x) 1e+05 256.06µs 3811.60378
altrep_blocked(x, 64) 1e+05 88.44µs 10896.82990
normal_iterator1(x) 1e+05 89.38µs 10907.70075
normal_iterator2(x) 1e+05 88.59µs 10970.03385
altrep_iterator1(x) 1e+05 260.25µs 3738.49057
altrep_iterator2(x) 1e+05 261.76µs 3712.33678
altrep_block_iterator1(x) 1e+05 214.12µs 4534.40887
altrep_block_iterator2(x) 1e+05 110.84µs 8785.61084
normal_sum1(x) 1e+06 863.53µs 1148.67154
normal_sum2(x) 1e+06 869.49µs 1136.20678
normal_sum3(x) 1e+06 2.55ms 388.75374
altrep_sum(x) 1e+06 2.54ms 391.10409
altrep_blocked(x, 64) 1e+06 873.3µs 1129.33728
normal_iterator1(x) 1e+06 844.09µs 1174.62185
normal_iterator2(x) 1e+06 846.24µs 1163.75373
altrep_iterator1(x) 1e+06 2.54ms 390.30531
altrep_iterator2(x) 1e+06 2.56ms 386.61222
altrep_block_iterator1(x) 1e+06 2.16ms 460.92093
altrep_block_iterator2(x) 1e+06 1.05ms 943.12344
normal_sum1(x) 1e+07 8.57ms 116.76921
normal_sum2(x) 1e+07 8.65ms 115.33691
normal_sum3(x) 1e+07 25.65ms 38.89224
altrep_sum(x) 1e+07 25.65ms 39.17382
altrep_blocked(x, 64) 1e+07 8.77ms 113.64278
normal_iterator1(x) 1e+07 8.56ms 116.16466
normal_iterator2(x) 1e+07 8.53ms 116.48583
altrep_iterator1(x) 1e+07 25.52ms 39.14110
altrep_iterator2(x) 1e+07 25.48ms 39.06390
altrep_block_iterator1(x) 1e+07 21.45ms 46.55944
altrep_block_iterator2(x) 1e+07 10.72ms 93.18537
plot(res)

The optimal buffersize can be explored as well (64 bytes seems optimal for this case).

res <- bench::press(
  n = 10 ^ 7,
  buf_size = 2 ^ (4:14),
  {
    set.seed(42)
    x <- rnorm(n)
    bench::mark(
      altrep_block_iterator2(x, buf_size)
    )
  }
)
to_keep <- c("buf_size", "median", "itr/sec")
knitr::kable(res[to_keep])
buf_size median itr/sec
16 12.5ms 69.13854
32 12.2ms 81.55934
64 11ms 90.48455
128 11.5ms 85.40311
256 12.2ms 81.39517
512 12.8ms 77.47887
1024 12.9ms 76.85149
2048 12.8ms 77.76877
4096 13.3ms 75.10114
8192 13.2ms 74.63072
16384 13.9ms 71.17870
Rcpp::sourceCpp("altrep.cc")