The idea of putting together this set of papers on R script enhancement and advanced performance has arisen due to:
Content
- vectorization
- fast data reading
- fast data writing
- script enhancement
Fast data reading
For an overview of fast reading options, see here.
Fast data writing
For an overview of fast writing options, see here.
R script enhancements
Vectorise and pre-allocate data structures
Examples are taken from here
# Build the demo data frame: 12^5 rows and four numeric columns, each drawn
# from a different distribution (uniform, normal, Poisson, chi-squared).
col1 <- runif(n = 12^5, min = 0, max = 2)
col2 <- rnorm(n = 12^5, mean = 0, sd = 2)
col3 <- rpois(n = 12^5, lambda = 3)
col4 <- rchisq(n = 12^5, df = 2)
df <- data.frame(col1 = col1, col2 = col2, col3 = col3, col4 = col4)
The logic we are about to optimise:
For every row on this data frame (df), check if the sum of all values is greater than 4.
If it is, a new 5th variable gets the value “greater_than_4”, else, it gets “lesser_than_4”.
# Original R code: before vectorization and pre-allocation.
# Deliberately slow baseline: every iteration reads four cells from the data
# frame and writes one cell back into column 5, so the data frame itself is
# modified once per row.
system.time({
  # seq_len() gives an empty sequence for a 0-row data frame, whereas
  # 1:nrow(df) would iterate over c(1, 0).
  for (i in seq_len(nrow(df))) { # for every row
    if ((df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]) > 4) {
      df[i, 5] <- "greater_than_4" # assign 5th column
    } else {
      df[i, 5] <- "lesser_than_4" # assign 5th column
    }
  }
})
user system elapsed
936.58 62.32 1233.08
# Drop column 5 (the labels written by the loop above) so the next example
# starts again from the four numeric columns.
df[,5] <- NULL
# After pre-allocation: the labels are collected in a character vector that is
# created once at full length, instead of being written into the data frame
# row by row.
output <- character(nrow(df)) # one empty slot per row
# loop with preallocated vector
system.time({
  # seq_len() gives an empty sequence for a 0-row data frame, whereas
  # 1:nrow(df) would iterate over c(1, 0).
  for (i in seq_len(nrow(df))) {
    if ((df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]) > 4) {
      output[i] <- "greater_than_4" # write into the vector, not the data frame
    } else {
      output[i] <- "lesser_than_4" # write into the vector, not the data frame
    }
  }
  # The data frame is touched only once: the finished vector becomes a column.
  df$output <- output
})
user system elapsed
31.77 0.07 42.01
# Drop column 5 (the `output` column added above) so the next example starts
# again from the four numeric columns.
df[,5] <- NULL
Place IF statements (expressions) outside the loop.
Examples are taken from here
# Pre-allocate the result vector: one empty slot per row.
output <- character(nrow(df))
# Evaluate the IF expression once, for all rows at the same time, outside the
# loop; the loop then only looks up a ready-made TRUE/FALSE per row.
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4
system.time({
  # seq_len() gives an empty sequence for a 0-row data frame, whereas
  # 1:nrow(df) would iterate over c(1, 0).
  for (i in seq_len(nrow(df))) {
    if (condition[i]) {
      output[i] <- "greater_than_4"
    } else {
      output[i] <- "lesser_than_4"
    }
  }
  # Attach the finished vector to the data frame in a single assignment.
  df$output <- output
})
user system elapsed
1.61 0.00 2.53
# Drop column 5 (the `output` column added above) so the next example starts
# again from the four numeric columns.
df[,5] <- NULL
Run the loop only for TRUE conditions
Examples are taken from here
# Pre-fill every slot with the "else" label. The loop below visits only the
# rows where the condition is TRUE, so rows it never touches must already
# carry "lesser_than_4" (an empty pre-allocated vector would leave them "").
output <- rep("lesser_than_4", nrow(df))
# the expression in IF statement, evaluated once outside the loop
condition <- (df$col1 + df$col2 + df$col3 + df$col4) > 4
system.time({
  # which() returns the indices of the TRUE entries, so no if/else is needed
  # inside the loop: every visited row is a "greater_than_4" row.
  for (i in which(condition)) { # run loop only for true conditions
    output[i] <- "greater_than_4"
  }
  # Attach the finished vector to the data frame in a single assignment.
  df$output <- output
})
user system elapsed
0.39 0.00 0.41
# Drop column 5 (the `output` column added above) so the next example starts
# again from the four numeric columns.
df[,5] <- NULL
Use ifelse() whenever possible
Examples are taken from here
You can make this logic much simpler and faster by using the ifelse() statement.
Looks like this is going to be a highly preferred option to speed up simple loops.
# Fully vectorized: ifelse() labels every row in one call, with no explicit
# loop; the result is then stored as the `output` column.
system.time({
  output <- ifelse(
    test = with(df, col1 + col2 + col3 + col4) > 4,
    yes = "greater_than_4",
    no = "lesser_than_4"
  )
  df[["output"]] <- output
})
user system elapsed
0.17 0.00 0.17
# Drop column 5 (the `output` column added above) so the next example starts
# again from the four numeric columns.
df[,5] <- NULL
Using which()
Examples are taken from here
# Vectorized with which(): start with every row labelled "lesser_than_4", then
# overwrite only the rows whose sum exceeds 4.
system.time({
  # Sum only the four numeric inputs, so a leftover character label column
  # cannot make rowSums() fail.
  want <- which(rowSums(df[, c("col1", "col2", "col3", "col4")]) > 4)
  # Same labels as the other examples, so the results are comparable.
  output <- rep("lesser_than_4", times = nrow(df))
  output[want] <- "greater_than_4"
  df$output <- output
})
user system elapsed
0.02 0.00 0.02
# Drop column 5 (the `output` column added above) so the next example starts
# again from the four numeric columns.
df[,5] <- NULL
Use apply family of functions instead of for-loops
Examples are taken from here
# apply family: call a labelling function once per row instead of a for loop
system.time({
  # x is one row of the four numeric columns, indexed here by column name.
  myfunc <- function(x) {
    above_4 <- (x["col1"] + x["col2"] + x["col3"] + x["col4"]) > 4
    if (above_4) "greater_than_4" else "lesser_than_4"
  }
  # MARGIN = 1 applies 'myfunc' on every row of the first four columns.
  output <- apply(df[, 1:4], MARGIN = 1, FUN = myfunc)
  df$output <- output
})
user system elapsed
3.01 0.02 3.85
# Drop column 5 (the `output` column added above) so the next example starts
# again from the four numeric columns.
df[,5] <- NULL
Use byte code compilation for functions cmpfun()
We will use the initial data frame as the input and the initial FOR loop (as in the first example).
# Label each row of `df` by whether col1 + col2 + col3 + col4 exceeds 4.
#
# Deliberately keeps the slow row-by-row loop of the first example, so that
# the effect of byte-compiling it with cmpfun() can be measured.
#
# df: data frame with numeric columns col1, col2, col3 and col4.
# Returns `df` with the label ("greater_than_4" or "lesser_than_4") of every
# row written to column 5. A 0-row data frame is returned unchanged.
f <- function(df) {
  # seq_len() gives an empty sequence for a 0-row data frame, whereas
  # 1:nrow(df) would iterate over c(1, 0).
  for (i in seq_len(nrow(df))) { # for every row
    if ((df[i, "col1"] + df[i, "col2"] + df[i, "col3"] + df[i, "col4"]) > 4) {
      df[i, 5] <- "greater_than_4" # assign 5th column
    } else {
      df[i, 5] <- "lesser_than_4" # assign 5th column
    }
  }
  # The loop changes only this function's copy of df, and a for loop itself
  # evaluates to NULL, so the labelled data frame must be returned explicitly.
  df
}
# Byte-compile f with cmpfun() and time the compiled version on the full
# data frame.
library(compiler)
g <- cmpfun(f)
system.time(g(df))
user system elapsed
558.29 254.10 872.24
# Reset step kept for symmetry with the earlier examples.
# NOTE(review): g(df) works on its own copy of df and its result is not
# assigned back, so the global df presumably has no 5th column at this
# point - confirm whether this reset is still needed.
df[,5] <- NULL
