The idea of compiling a set of papers on R script enhancement and advanced performance arose due to:

Content

vectorization

For a brief example, see here - 2.3.2 Example of vectorization

fast reading data

For an overview of fast reading options, see here

fast writing data

For an overview of fast writing options, see here

R script enhancements

Vectorise and pre-allocate data structures

Examples are taken from here

# Build the benchmark data frame: 12^5 (248,832) rows, four numeric columns
col1 <- runif(n = 12^5, min = 0, max = 2)  # uniform on [0, 2]
col2 <- rnorm(n = 12^5, mean = 0, sd = 2)  # normal, mean 0, sd 2
col3 <- rpois(n = 12^5, lambda = 3)        # Poisson, rate 3
col4 <- rchisq(n = 12^5, df = 2)           # chi-squared, 2 degrees of freedom
df <- data.frame(col1, col2, col3, col4)

The logic we are about to optimise:
For every row in this data frame (df), check if the sum of all values is greater than 4.

If it is, a new 5th variable gets the value “greater_than_4”, else, it gets “lesser_than_4”.

# Original R code: before vectorization and pre-allocation.
# Deliberately naive baseline: every iteration reads four cells from the data
# frame and writes one cell back into it, so the data frame itself is modified
# once per row.
system.time({
  for (i in seq_len(nrow(df))) { # every row; seq_len() is safe for 0 rows
    if ((df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]) > 4) {
      df[i, 5] <- "greater_than_4" # assign 5th column
    } else {
      df[i, 5] <- "lesser_than_4" # assign 5th column
    }
  }
})
   user  system elapsed 
 936.58   62.32 1233.08 
df[, 5] <- NULL # drop the 5th column left behind by the previous run
# After pre-allocation: collect the labels in a plain character vector
output <- character(nrow(df)) # pre-allocated result vector, one slot per row
# loop with preallocated vector
system.time({
  for (i in seq_len(nrow(df))) { # seq_len() is safe for a 0-row data frame
    if ((df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]) > 4) {
      output[i] <- "greater_than_4" # write into the vector, not the data frame
    } else {
      output[i] <- "lesser_than_4" # write into the vector, not the data frame
    }
  }

  # Attach the finished vector to the data frame in a single assignment
  # instead of one data-frame write per row.
  df$output <- output
})
   user  system elapsed 
  31.77    0.07   42.01 
# Drop the 5th column so the next example starts from the four input columns
df[, 5] <- NULL

Place IF statements (expressions) outside the loop.

Examples are taken from here

# Pre-allocate the result vector, one slot per row
output <- character(nrow(df))
# Evaluate the IF expression once for all rows, outside the loop
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4
system.time({
  # The loop now only looks up a precomputed logical per row;
  # seq_along() yields no iterations when there are no rows.
  for (i in seq_along(condition)) {
    if (condition[i]) {
      output[i] <- "greater_than_4"
    } else {
      output[i] <- "lesser_than_4"
    }
  }
  df$output <- output
})
   user  system elapsed 
   1.61    0.00    2.53 
# Drop the 5th column so the next example starts from the four input columns
df[, 5] <- NULL

Run the loop only for TRUE conditions

Examples are taken from here

# Pre-fill every slot with the label for the FALSE case. The loop below only
# visits rows where the condition is TRUE, so rows it skips must already
# carry the right label (a blank character vector would leave them as "").
output <- rep("lesser_than_4", nrow(df))
# the expression in IF statement
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4 # checked outside loop
system.time({
  for (i in which(condition)) { # run loop only for true conditions
    # No inner if/else needed: condition[i] is TRUE for every index visited.
    output[i] <- "greater_than_4"
  }
  # Attach the finished vector to the data frame in a single assignment.
  df$output <- output
})
   user  system elapsed 
   0.39    0.00    0.41 
# Drop the 5th column so the next example starts from the four input columns
df[, 5] <- NULL

Use ifelse() whenever possible

Examples are taken from here

You can make this logic much simpler and faster by using the ifelse() statement.

Looks like this is going to be a highly preferred option to speed up simple loops.

system.time({
  # One vectorised call labels every row; no explicit loop is needed.
  output <- ifelse(
    with(df, col1 + col2 + col3 + col4) > 4,
    "greater_than_4",
    "lesser_than_4"
  )
  df$output <- output
})
   user  system elapsed 
   0.17    0.00    0.17 
# Drop the 5th column so the next example starts from the four input columns
df[, 5] <- NULL

Using which()

Examples are taken from here

system.time({
  # Sum only the four numeric input columns, selected by name, so the result
  # does not depend on which other columns df happens to carry.
  want <- which(rowSums(df[, c("col1", "col2", "col3", "col4")]) > 4)
  # Start with the FALSE label everywhere, then overwrite the matching rows.
  # Labels match the ones used by the other examples.
  output <- rep("lesser_than_4", times = nrow(df))
  output[want] <- "greater_than_4"
  df$output <- output
})
   user  system elapsed 
   0.02    0.00    0.02 
# Drop the 5th column so the next example starts from the four input columns
df[, 5] <- NULL

Use apply family of functions instead of for-loops

Examples are taken from here

# apply family
system.time({
  # Classify one row, passed in as a named vector with elements col1..col4.
  myfunc <- function(x) {
    row_is_large <- (x["col1"] + x["col2"] + x["col3"] + x["col4"]) > 4
    if (row_is_large) "greater_than_4" else "lesser_than_4"
  }
  # MARGIN = 1 hands each row of the first four columns to myfunc
  output <- apply(df[, 1:4], MARGIN = 1, FUN = myfunc)
  df$output <- output
})
   user  system elapsed 
   3.01    0.02    3.85 
# Drop the 5th column so the next example starts from the four input columns
df[, 5] <- NULL

Use byte code compilation for functions cmpfun()

We will use the initial data frame as the input and the initial for loop (as in the first example).

# Row-by-row classifier (the same naive loop as the first example), wrapped
# in a function so that it can be byte-compiled.
#
# Args:
#   df: data frame with numeric columns col1, col2, col3 and col4.
#
# Returns:
#   The modified copy of `df`: for every row, column 5 is set to
#   "greater_than_4" when col1 + col2 + col3 + col4 > 4 and to
#   "lesser_than_4" otherwise. A 0-row input is returned unchanged.
f <- function(df) {
  for (i in seq_len(nrow(df))) { # every row; no iterations for 0 rows
    if ((df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]) > 4) {
      df[i, 5] <- "greater_than_4" # assign 5th column
    } else {
      df[i, 5] <- "lesser_than_4" # assign 5th column
    }
  }
  # The assignments above change the function's own copy of `df`, not the
  # caller's object, so hand the result back; without this the function
  # would return the loop's NULL and the work would be lost.
  df
}
library(compiler)
# Byte-compile f; g is the compiled version and is called the same way.
g <- cmpfun(f)
# Time the compiled loop on the full data frame.
system.time({
  g(df)
})
   user  system elapsed 
 558.29  254.10  872.24 
# NOTE(review): g(df) above is called without assigning its result, so the
# global df is presumably still four columns wide at this point and there may
# be no 5th column to drop — confirm whether this cleanup is needed.
df[,5] <- NULL
