Optimizing R code with Rcpp

library(Rcpp)
library(microbenchmark)

## Warning: package 'microbenchmark' was built under R version 4.0.3

x <- rnorm(50000)
# Define the function sum_loop
sum_loop <- function(x) {
  result <- 0
  for(i in x){
    result <- result + i
  }
  result
}

# Check for equality 
all.equal(sum_loop(x), sum(x))

## [1] TRUE

# Compare the performance
microbenchmark(sum_loop = sum_loop(x), R_sum = sum(x))

## Unit: microseconds
##      expr      min       lq       mean   median        uq       max neval
##  sum_loop 1077.300 1126.651 1802.74105 1225.302 1529.3005 12858.501   100
##     R_sum   44.701   46.102   64.12512   53.651   68.7515   147.901   100

Data types in C++

# Evaluate 2 + 2 in C++
x <- evalCpp("2+2")

# Evaluate 2 + 2 in R
y <- 2 + 2

# Storage modes of x and y
storage.mode(x)

## [1] "integer"

storage.mode(y)

## [1] "double"

# Change the C++ expression so that it returns a double
z <- evalCpp("(double)2 + 2")


# Evaluate 17 / 2 in C++
evalCpp("17/2")

## [1] 8

# Cast 17 to a double and divide by 2
evalCpp("(double)17 / 2")

## [1] 8.5

# Cast 56.3 to an int
evalCpp("(int)56.3")

## [1] 56

C++ function with Rcpp

In between the C++ code and the R function there are many layers of engineering For c++ function we must declare the type of each argument and the objects it returns.

# Save temporary C++ file
cppFunction(" int fun(){
  int x = 37;
  return x;
}")

fun()

## [1] 37

Sum function with R and C++

cppFunction("
  double add( double x, double y){
    double res = x + y;
    return res;
  }
")


addr <- function(x, y){
  res <- x + y
  res
}

microbenchmark(cpp = add(20,25), R_sum = addr(20 , 25))

## Unit: nanoseconds
##   expr  min   lq     mean median     uq     max neval
##    cpp 1101 1201 11574.95   1301 1600.5  976900   100
##  R_sum  300  401 14061.02    401  501.0 1352202   100

Euclidian function example

# Define the function euclidean_distance()
cppFunction('
  double euclidean_distance(double x, double y) {
    return sqrt(x*x + y*y) ;
  }
')

# Calculate the euclidean distance
euclidean_distance(1.5, 2.5)

## [1] 2.915476

Comments in functions

# Define the function add()
cppFunction('
  int add(int x, int y) {
    int res = x + y ;
    Rprintf("** %d + %d = %d\\n", x, y, res) ;
    return res ;
  }
')

# Call add() to print THE answer
add(40, 2)

## ** 40 + 2 = 42

## [1] 42

Handling errors in c++

cppFunction('
  // adds x and y, but only if they are positive
  int add_positive_numbers(int x, int y) {
      // if x is negative, stop
      if(x < 0) stop("x is negative") ;
    
      // if y is negative, stop
      if(y < 0) stop("y is negative") ;
     
      return x + y ;
  }
')

# Call the function with positive numbers
add_positive_numbers(2, 3)

## [1] 5

C++ functions from c++ files

// [[Rcpp::Export]] important to compile c++ code in R. Always put when create a function code.cpp

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
int timeTwo( int x ){
  return 2*x
}

Call c++ script

library(Rcpp)
sourceCpp("code.cpp")
timesTwo( 21 )

more complex functions in C++

In this case only universal() function is exported

#include <Rcpp.h>
using namespace Rcpp;

int twice( int x ){
 return 2*x
}

// {{Rcpp::export}}
int universal(){
  return twice(21)
}

Compiling R code in c++ script

#include <Rcpp.h>
using namespace Rcpp;

int twice( int x ){
 return 2*x
}

// {{Rcpp::export}}
int universal(){
  return twice(21)
}

/*** R
  # This is R code
  12 + 30
  
  universal()
*/

Other C++ example

// Include the Rcpp.h header
#include <Rcpp.h>

// Use the Rcpp namespace
using namespace Rcpp;

// [[Rcpp::export]]
int the_answer() {
    // Return 42
    return 42;
}

/*** R
# Call the_answer() to check you get the right result
the_answer()
*/

Distance function y C++


#include <Rcpp.h>
using namespace Rcpp; 

// Make square() accept and return a double
double square(double x) {
  // Return x times x
  return x*x ;
}

// [[Rcpp::export]]
double dist(double x, double y) {
  // Change this to use square()
  return sqrt(x + y);
}

/*** R
# Call dist() to the point (3, 4)
  dist(3, 4)
# Close the Rcpp R comment block
*/

Absolute function in C++


#include <Rcpp.h>
using namespace Rcpp ;

// [[Rcpp::export]]
double absolute(double x) {
  // Test for x greater than zero
  if(x > 0) {
    // Return x
    return x ;
  // Otherwise
  } else {
    // Return negative x
    return -x;
  }
}

/*** R  
absolute(pi)
absolute(-3)
*/

For loops in C++

Babilonian method: \[ x_{n+1} = \frac{1}{2}( x_n + \frac{S}{x_{n}} ) \]

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
double sqrt_approx(double value, int n) {
    // Initialize x to be one
    double x = 1;
    // Specify the for loop
    for(int i = 0; i<n; i++) {
        x = (x + value / x) / 2.0;
    }
    return x;
}

/*** R
sqrt_approx(2, 10)
*/

Square root with early stop by approximation

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
List sqrt_approx(double value, int n, double threshold) {
    double x = 1.0;
    double previous = x;
    bool is_good_enough = false;
    int i;
    for(i = 0; i < n; i++) {
        previous = x;
        x = (x + value / x) / 2.0;
        is_good_enough = fabs(previous - x) < threshold;
        
        // If the solution is good enough, then "break"
        if(is_good_enough) break;
    }
    return List::create(_["i"] = i , _["x"] = x);
}

/*** R
sqrt_approx(2, 1000, 0.1)
*/

Rcpp clases and vectors

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
double first_plus_last(NumericVector x) {
    // The size of x
    int n = x.size();
    // The first element of x
    double first = x[0];
    // The last element of x
    double last = x[n - 1];
    return first + last;
}

/*** R
x <- c(6, 28, 496, 8128)
first_plus_last(x)
# Does the function give the same answer as R?
all.equal(first_plus_last(x), x[1] + x[4])
*/

Sum of values of a vector

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
double sum_cpp(NumericVector x) {
  // The size of x
  int n = x.size();
  // Initialize the result
  double result = 0;
  // Complete the loop specification
  for(int i = 0; i<n; i++) {
    // Add the next value
    result = result + x[i];
  }
  return result;
}

/*** R
set.seed(42)
x <- rnorm(1e6)
sum_cpp(x)
# Does the function give the same answer as R's sum() function?
all.equal(sum_cpp(x), sum(x))
*/

Create sequence function in C++

#include <Rcpp.h>
using namespace Rcpp;

// Set the return type to IntegerVector
// [[Rcpp::export]]
IntegerVector seq_cpp(int lo, int hi) {
  int n = hi - lo + 1;
    
  // Create a new integer vector, sequence, of size n
  IntegerVector sequence(n);
    
  for(int i = 0; i < n; i++) {
    // Set the ith element of sequence to lo plus i
    sequence[i] = lo + i;
  }
  
  return sequence;
}

/*** R
lo <- -2
hi <- 5
seq_cpp(lo, hi)
# Does it give the same answer as R's seq() function?
all.equal(seq_cpp(lo, hi), seq(lo, hi))
*/

Create more vectores

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
List create_vectors() {
  // Create an unnamed character vector
  CharacterVector polygons = CharacterVector::create("triangle", "square", "pentagon"); 
  // Create a named integer vector
  IntegerVector mersenne_primes = IntegerVector::create(_["first"] = 3, _["second"] = 7, _["third"] = 31);
  // Create a named list
  List both = List::create(_["polygons"] = polygons, _["mersenne_primes"] = mersenne_primes);
  return both;
}

/*** R
create_vectors()
*/

Weighted mean in C ++

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
double weighted_mean_cpp(NumericVector x, NumericVector w) {
  double total_w = 0;
  double total_xw = 0;
  
  int n = x.size();
  
  for(int i = 0; i < n; i++) {
    // If the ith element of x or w is NA then return NA
    if(NumericVector::is_na(x[i]) || NumericVector::is_na(w[i])) {
      return NumericVector::get_na();
    } 
    total_w += w[i];
    total_xw += x[i] * w[i];
  }
  
  return total_xw / total_w;
}

/*** R 
x <- c(0, 1, 3, 6, 2, 7, 13, NA, 12, 21, 11)
w <- 1 / seq_along(x)
weighted_mean_cpp(x, w)
*/
*/

Vectors with sTL (standard template library)

More efficient than Rcpp. Handles memory without making copies.

The standard template library (stl) is a C++ library containing flexible algorithms and data structures. For example, the double vector from the stl is like a “native C++” equivalent of Rcpp’s NumericVector. the following code creates a standard double vector named x with ten elements.

Scalar rancom number generation


#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector positive_rnorm(int n, double mean, double sd) {
  // Specify out as a numeric vector of size n
  NumericVector out(n);
  
  // This loops over the elements of out
  for(int i = 0; i < n; i++) {
    // This loop keeps trying to generate a value
    do {
      // Call R's rnorm()
      out[i] = R::rnorm(mean, sd);
      // While the number is negative, keep trying
    } while(out[i] <= 0);
  }
  return out;
}

/*** R
  positive_rnorm(10, 2, 2)
*/

Sampling from a mixture of distributions

#include <Rcpp.h>
using namespace Rcpp;

// From previous exercise; do not modify
// [[Rcpp::export]]
int choose_component(NumericVector weights, double total_weight) {
  double x = R::runif(0, total_weight);
  int j = 0;
  while(x >= weights[j]) {
    x -= weights[j];
    j++;
  }
  return j;
}

// [[Rcpp::export]]
NumericVector rmix(int n, NumericVector weights, NumericVector means, NumericVector sds) {
  // Check that weights and means have the same size
  int d = weights.size();
  if(means.size() != d) {
    stop("means size != weights size");
  }
  // Do the same for the weights and std devs
  if(sds.size() != d) {
    stop("sds size != weights size");
  }
  
  // Calculate the total weight
  double total_weight = sum(weights);
  
  // Create the output vector
  NumericVector res(n);
  
  // Fill the vector
  for(int i = 0; i < n; i++) {
    // Choose a component
    int j = choose_component(weights, total_weight);
    
    // Simulate from the chosen component
    res[i] = R::rnorm(means[j], sds[j]);
  }
  
  return res;
}

/*** R
  weights <- c(0.3, 0.7)
  means <- c(2, 4)
  sds <- c(2, 4)
  rmix(10, weights, means, sds)
*/

Rolling mean comparison

rollmean1 <- function(x, window = 3) {
  n <- length(x)
  res <- rep(NA, n)
  for(i in seq(window, n)) {
    res[i] <- mean(x[seq(i - window + 1, i)])
  }
  res
}


rollmean2 <- function(x, window = 3){
  n <- length(x)
  res <- rep(NA, n)
  total <- sum(head(x, window))
  res[window] <- total / window
  for(i in seq(window + 1, n)) {
    total <- total + x[i] - x[i - window]
    res[i] <- total / window
  }
  res
}

# Complete the definition of rollmean3()
rollmean3 <- function(x, window = 3) {
  # Add the first window elements of x
  initial_total <- sum(head(x, window))

  # The elements to add at each iteration    
  lasts <- tail(x, - window)
  
  # The elements to remove
  firsts <- head(x, - window)

  # Take the initial total and add the 
  # cumulative sum of lasts minus firsts
  other_totals <- initial_total + cumsum(lasts - firsts)

  # Build the output vector 
  c(
    rep(NA, window - 1),    # leading NA
    initial_total / window, # initial mean
    other_totals / window   # other means
  )
}


#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector rollmean4(NumericVector x, int window) {
  int n = x.size();
  
  // Set res as a NumericVector of NAs with length n
  NumericVector res(n, NumericVector::get_na());
  
  // Sum the first window worth of values of x
  double total = 0.0;
  for(int i = 0; i < window; i++) {
    total += x[i];
  }
  
  // Treat the first case seperately
  res[window - 1] = total / window;
  
  // Iteratively update the total and recalculate the mean  
  for(int i = window; i < n; i++) {
    // Remove the (i - window)th case, and add the ith case
    total += - x[i - window] + x[i];
    // Calculate the mean at the ith position
    res[i] = total / window;
  }
  
  return res;  
}

/*** R
   # Compare rollmean2, rollmean3 and rollmean4   
   set.seed(42)
   x <- rnorm(10000)
   microbenchmark( 
    rollmean2(x, 4), 
    rollmean3(x, 4), 
    rollmean4(x, 4), 
    times = 5
   )   
*/