Packages needed on this document.
library(Rcpp) #For running C++ inside R
library(tidyr) #For easily tidy data
library(dplyr, warn.conflicts = FALSE) #data manipulation
library(ggplot2) #graphical environment
library(microbenchmark) #For computing processing time
library(knitr) #table plot
C++ using the package “Rcpp”.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector rowSumsC(NumericMatrix x) {
int nrow = x.nrow(), ncol = x.ncol();
NumericVector out(nrow);
for (int i = 0; i < nrow; i++) {
double total = 0;
for (int j = 0; j < ncol; j++) {
total += x(i, j);
}
out[i] = total;
}
return out;
}
In all the codes below, big.matrix is an matrix argument and N is an integer that tells us the number of times to evaluate the expression. Both will be inserted by the user.
cppSum <- function(big.matrix, N) {
stopifnot(exists("big.matrix"))
stopifnot(exists("N"))
Time <- microbenchmark({
colsums <- rowSumsC(big.matrix)
}, times = N)$time
return(Time)
}
By using the optimized “colSums” function from R:
ColSum <- function(big.matrix, N) {
stopifnot(exists("big.matrix"))
stopifnot(exists("N"))
Time <- microbenchmark({
colsums <- colSums(big.matrix)
}, times = N)$time
return(Time)
}
By using the “apply” function from R:
Apply <- function(big.matrix, N) {
stopifnot(exists("big.matrix"))
stopifnot(exists("N"))
Time <- microbenchmark({
colsums <- apply(big.matrix, 2, sum)
}, times = N)$time
return(Time)
}
By using matricidal multiplication:
matrixSum <- function(big.matrix, N) {
stopifnot(exists("big.matrix"))
stopifnot(exists("N"))
Time <- microbenchmark({
ones <- rep(1, dim(big.matrix)[1])
colsums <- ones %*% big.matrix
}, times = N)$time
return(Time)
}
The single loop function:
SingleLoop <- function(big.matrix, N) {
stopifnot(exists("big.matrix"))
stopifnot(exists("N"))
colsums <- rep(NA, dim(big.matrix)[2])
Time <- microbenchmark({
for (i in 1:dim(big.matrix)[2]){
colsums[i] <- sum(big.matrix[,i])
}
}, times = N)$time
return(Time)
}
The double loop function:
DoubleLoop <- function(big.matrix, N) {
stopifnot(exists("big.matrix"))
stopifnot(exists("N"))
Time <- microbenchmark({
colsums <- rep(NA, dim(big.matrix)[2])
for (i in 1:dim(big.matrix)[2]){
s <- 0
for (j in 1:dim(big.matrix)[1]){
s <- s + big.matrix[j, i]
}
colsums[i] <- s
}
}, times = N)$time
return(Time)
}
The codes we have built previously are used to sum the columns of a matrix. Now, we are wondering in which function we spent more time on process. For this we are going to create a random process to evaluate the performance of each function.
Trials = 15 # Number of experiments
Experiments <- as.data.frame(matrix(NA, nrow= Trials, ncol=5)) # Initializing an empty dataframe
colnames(Experiments)<-c("CppSum", "SingleLoop", "Apply", "ColSum", "MatrixSum") # Changing the name of the columns
for (n in 1:Trials) {
big.matrix <- matrix(runif(1e+6), nrow = 1000)
Experiments[n,] = c(cppSum(big.matrix, N = 1), # FILLING THE EMPTY DATAFRAME
SingleLoop(big.matrix, N = 1),
Apply(big.matrix, N = 1),
ColSum(big.matrix, N = 1),
matrixSum(big.matrix, N = 1))
}
It follows the experiments table below, which one has the time spent for processing time in each function built in 15 trials:
| CppSum | SingleLoop | Apply | ColSum | MatrixSum |
|---|---|---|---|---|
| 4040703 | 11689766 | 15646863 | 1444005 | 2238157 |
| 5231005 | 8943803 | 15052799 | 1499768 | 2256922 |
| 3335054 | 8874590 | 12131814 | 1452291 | 2253763 |
| 3358749 | 8872080 | 12098386 | 1478929 | 2188823 |
| 3417912 | 8916909 | 12121692 | 1535037 | 2204139 |
| 3349052 | 8853597 | 12725393 | 1510674 | 2213582 |
| 3362028 | 8878595 | 13663631 | 1454926 | 2224903 |
| 3342560 | 9187235 | 12129168 | 1565688 | 2179259 |
| 3336599 | 9450264 | 12148831 | 1452273 | 2503473 |
| 3748137 | 9656435 | 12220309 | 1566774 | 2298531 |
| 4827219 | 8961730 | 12320469 | 2187986 | 2218131 |
| 3601511 | 8964148 | 12146970 | 1421873 | 2283817 |
| 4884742 | 9399076 | 13867868 | 1439750 | 2179974 |
| 3830910 | 8916469 | 12265953 | 1422842 | 2266107 |
| 4824674 | 9737731 | 12238797 | 1519767 | 2238111 |
Now, there is the boxplot between the variables time spent and the method used to manipulate the operations on the matrix, where we can easily see that the code spent most time running the Apply function. The SigleLoop function also has a big percent of our processing time.
p <- Experiments %>% gather("Method", "Time") %>% ggplot(aes(x = reorder(Method, Time, FUN=median), y = Time, fill = Method))
p + geom_boxplot() + labs(x = "Method", y = "Time in nanoseconds")