Introduction

Packages needed on this document.

library(Rcpp)      #For running C++ inside R
library(tidyr)     #For easily tidy data
library(dplyr, warn.conflicts =  FALSE)  #data manipulation
library(ggplot2)   #graphical environment
library(microbenchmark)  #For computing processing time
library(knitr)     #table plot

Functions

C++ using the package “Rcpp”.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector rowSumsC(NumericMatrix x) {
  int nrow = x.nrow(), ncol = x.ncol();
  NumericVector out(nrow);

  for (int i = 0; i < nrow; i++) {
    double total = 0;
    for (int j = 0; j < ncol; j++) {
      total += x(i, j);
    }
    out[i] = total;
  }
  return out;
}

In all the codes below, big.matrix is an matrix argument and N is an integer that tells us the number of times to evaluate the expression. Both will be inserted by the user.

cppSum <- function(big.matrix, N) {
  stopifnot(exists("big.matrix"))
  stopifnot(exists("N"))
  Time <- microbenchmark({
    colsums <- rowSumsC(big.matrix)
  }, times = N)$time
  return(Time)  
}

By using the optimized “colSums” function from R:

ColSum <- function(big.matrix, N) {
  stopifnot(exists("big.matrix"))
  stopifnot(exists("N"))
  Time <- microbenchmark({
    colsums <- colSums(big.matrix)
  }, times = N)$time
  return(Time)  
}

By using the “apply” function from R:

Apply <- function(big.matrix, N) {
  stopifnot(exists("big.matrix"))
  stopifnot(exists("N"))
  Time <- microbenchmark({
    colsums <- apply(big.matrix, 2, sum)
  }, times = N)$time
  return(Time)  
}

By using matricidal multiplication:

matrixSum <- function(big.matrix, N) {
  stopifnot(exists("big.matrix"))
  stopifnot(exists("N"))
  Time <- microbenchmark({
    ones <- rep(1, dim(big.matrix)[1])
    colsums <- ones %*% big.matrix
  }, times = N)$time
  return(Time)  
}

The single loop function:

SingleLoop <- function(big.matrix, N) {
  stopifnot(exists("big.matrix"))
  stopifnot(exists("N"))
  colsums <- rep(NA, dim(big.matrix)[2])
  Time <- microbenchmark({
    for (i in 1:dim(big.matrix)[2]){
      colsums[i] <- sum(big.matrix[,i])
    }
  }, times = N)$time
  return(Time)
}

The double loop function:

DoubleLoop <- function(big.matrix, N) {
  stopifnot(exists("big.matrix"))
  stopifnot(exists("N"))
  Time <- microbenchmark({
  colsums <- rep(NA, dim(big.matrix)[2])    
      for (i in 1:dim(big.matrix)[2]){
        s <- 0
        for (j in 1:dim(big.matrix)[1]){
          s <- s + big.matrix[j, i]
        }
        colsums[i] <- s
      }
    }, times = N)$time
  return(Time)
} 

First experiment

The codes we have built previously are used to sum the columns of a matrix. Now, we are wondering in which function we spent more time on process. For this we are going to create a random process to evaluate the performance of each function.

Trials = 15 # Number of experiments
Experiments <- as.data.frame(matrix(NA, nrow= Trials, ncol=5)) # Initializing an empty dataframe
colnames(Experiments)<-c("CppSum", "SingleLoop", "Apply", "ColSum", "MatrixSum")  # Changing the name of the columns

for (n in 1:Trials) {                                       
  big.matrix <- matrix(runif(1e+6), nrow = 1000)
  Experiments[n,] = c(cppSum(big.matrix, N = 1),                 # FILLING THE EMPTY DATAFRAME
                      SingleLoop(big.matrix, N = 1), 
                      Apply(big.matrix, N = 1), 
                      ColSum(big.matrix, N = 1), 
                      matrixSum(big.matrix, N = 1))
}

It follows the experiments table below, which one has the time spent for processing time in each function built in 15 trials:

CppSum SingleLoop Apply ColSum MatrixSum
4040703 11689766 15646863 1444005 2238157
5231005 8943803 15052799 1499768 2256922
3335054 8874590 12131814 1452291 2253763
3358749 8872080 12098386 1478929 2188823
3417912 8916909 12121692 1535037 2204139
3349052 8853597 12725393 1510674 2213582
3362028 8878595 13663631 1454926 2224903
3342560 9187235 12129168 1565688 2179259
3336599 9450264 12148831 1452273 2503473
3748137 9656435 12220309 1566774 2298531
4827219 8961730 12320469 2187986 2218131
3601511 8964148 12146970 1421873 2283817
4884742 9399076 13867868 1439750 2179974
3830910 8916469 12265953 1422842 2266107
4824674 9737731 12238797 1519767 2238111

Now, there is the boxplot between the variables time spent and the method used to manipulate the operations on the matrix, where we can easily see that the code spent most time running the Apply function. The SigleLoop function also has a big percent of our processing time.

p  <-  Experiments  %>% gather("Method", "Time")  %>% ggplot(aes(x =  reorder(Method, Time, FUN=median), y = Time, fill = Method)) 
p + geom_boxplot() + labs(x = "Method", y = "Time in nanoseconds")