This is a benchmark comparing `map`, `lapply`, `foreach`, and `for` in R.
#' Column means via `lapply()`.
#'
#' @param data A data frame (or other list). `mean()` is applied to each
#'   element, which for a data frame means each column.
#' @return The per-element means, flattened into a vector by `unlist()`.
lapply_colmean <- function(data) {
  # A data frame is a list of columns, so lapply() visits one column at a time.
  per_column <- lapply(X = data, FUN = mean)
  unlist(per_column)
}
#' Column means via `apply()` over the second margin.
#'
#' @param data A matrix or data frame.
#' @return The result of applying `mean()` to every column of `data`.
apply_colmean <- function(data) {
  apply(X = data, MARGIN = 2L, FUN = mean)
}
#' Column means via `purrr::map_dbl()`.
#'
#' @param data A data frame (or other list); `mean()` is mapped over its
#'   elements.
#' @return The value returned by `purrr::map_dbl()`, one mean per element.
map_colmean <- function(data) {
  purrr::map_dbl(.x = data, .f = mean)
}
#' Column means via a sequential `foreach` loop.
#'
#' @param data A matrix or data frame; columns are extracted with `data[, i]`.
#' @return The per-column means combined with `c()`.
foreach_colmean <- function(data) {
  # Attached here so that foreach() and %do% are in scope. Note that this call
  # is repeated on every invocation and therefore counts toward any timing.
  library(foreach)
  # seq_len() avoids iterating over c(1, 0) when `data` has no columns.
  foreach(i = seq_len(ncol(data)), .combine = c) %do% {
    mean(data[, i])
  }
}
#' Column means via an explicit `for` loop.
#'
#' @param data A matrix or data frame; columns are extracted with `data[, i]`.
#' @return An unnamed numeric vector with one mean per column of `data`
#'   (length zero when `data` has no columns).
for_colmean <- function(data) {
  n_cols <- ncol(data)
  # Preallocate one slot per column. The previous `c(NA, length = n)` did not
  # preallocate: it built a two-element vector with an element named "length",
  # which leaked a stray value for one-column input and a stray name otherwise.
  mean_vector <- numeric(n_cols)
  # seq_len() yields an empty sequence for zero columns, unlike 1:n_cols.
  for (i in seq_len(n_cols)) {
    mean_vector[i] <- mean(data[, i])
  }
  mean_vector
}
#' Column means via `vapply()` with a length-one numeric template.
#'
#' @param data A data frame (or other list); `mean()` is applied to each
#'   element.
#' @return A numeric vector holding one mean per element of `data`.
vapply_colmean <- function(data) {
  vapply(X = data, FUN = mean, FUN.VALUE = double(1L))
}
# Sanity check: run every column-mean helper on three identical columns of
# 1..10; the recorded output below each call shows 5.5 for every column.
test_dataframe <- data.frame(1:10, 1:10, 1:10)
lapply_colmean(data = test_dataframe)
## X1.10 X1.10.1 X1.10.2
## 5.5 5.5 5.5
apply_colmean(data = test_dataframe)
## X1.10 X1.10.1 X1.10.2
## 5.5 5.5 5.5
map_colmean(data = test_dataframe)
## X1.10 X1.10.1 X1.10.2
## 5.5 5.5 5.5
foreach_colmean(data = test_dataframe)
## [1] 5.5 5.5 5.5
for_colmean(data = test_dataframe)
## length
## 5.5 5.5 5.5
vapply_colmean(data = test_dataframe)
## X1.10 X1.10.1 X1.10.2
## 5.5 5.5 5.5
# Benchmark the column-mean helpers (plus base colMeans()) on a data frame of
# standard-normal draws with 1000 rows and 100 columns, then plot the timings.
library(microbenchmark)
library(magrittr)
library(ggplot2)
row_num <- 1000
col_num <- 100
random_dataframe_colmean <- as.data.frame(
  matrix(rnorm(row_num * col_num), nrow = row_num, ncol = col_num)
)
autoplot(
  microbenchmark(
    lapply_colmean(random_dataframe_colmean),
    vapply_colmean(random_dataframe_colmean),
    apply_colmean(random_dataframe_colmean),
    map_colmean(random_dataframe_colmean),
    foreach_colmean(random_dataframe_colmean),
    for_colmean(random_dataframe_colmean),
    colMeans(random_dataframe_colmean),
    times = 1000
  )
)
When the number of columns is smaller than the number of rows (non-high-dimensional data): 1) The speed ranking of the different functions is vapply > lapply > colMeans > map > for > apply >> foreach. 2) The exceedingly slow speed of foreach is somewhat surprising to me. 3) lapply is a very fast functional; it is even faster than the colMeans function, which is dedicated to calculating column means. It is also faster than the simple C code I wrote. (Add Rcpp here later.)
Next, let us see what happens when the dimensions of the data frame change.
library(microbenchmark)
library(magrittr)
library(ggplot2)
# Swap rows and columns of the previous data. t() applied to a data frame
# returns a matrix, and lapply()/vapply()/purrr::map_dbl() walk a matrix cell
# by cell rather than column by column, so convert the result back to a data
# frame to keep every helper computing one mean per column.
# NOTE(review): timings quoted for this section were obtained on the matrix
# version and should be re-measured.
random_dataframe_colmean_t <- as.data.frame(t(random_dataframe_colmean))
microbenchmark(
  lapply_colmean(random_dataframe_colmean_t),
  vapply_colmean(random_dataframe_colmean_t),
  apply_colmean(random_dataframe_colmean_t),
  map_colmean(random_dataframe_colmean_t),
  foreach_colmean(random_dataframe_colmean_t),
  for_colmean(random_dataframe_colmean_t),
  colMeans(random_dataframe_colmean_t),
  times = 1000
) %>% autoplot
When the number of columns is larger than the number of rows (high-dimensional data): 1) The speed ranking of the different functions is colMeans >> for \(\approx\) apply > foreach > map > lapply \(\approx\) vapply. 2) lapply is the slowest in this situation.
Let us profile the code to see where the time is spent in the foreach_colmean calculation.
# Profile foreach_colmean() with profvis, using interval = 0.05.
library(profvis)
library(magrittr)
library(ggplot2)
row_num <- 10000
col_num <- 1000
random_dataframe_colmean <- as.data.frame(
  matrix(rnorm(row_num * col_num), nrow = row_num, ncol = col_num)
)
# NOTE(review): the profiled call below uses random_dataframe_colmean_t, which
# is not created in this chunk; the data frame regenerated just above is never
# passed to the profiler. Confirm this is intended.
profvis(
  {
    # Redefined inside the profiled expression, as in the original chunk.
    foreach_colmean <- function(data) {
      library(foreach)
      foreach(i = 1:ncol(data), .combine = c) %do% {
        mean(data[, i])
      }
    }
    foreach_colmean(random_dataframe_colmean_t)
  },
  interval = 0.05
)
The majority of the time is spent doing mean.default. If you want to speed things up, you should focus on how to improve the performance of calculating the mean for a vector.
Let us try a less trivial example. What if we want to calculate the mean squared difference of the xu, yu, and zu values? The calculation has three steps: 1) calculate the lagged differences; 2) square all the values; 3) calculate the mean of the squared values and sum the means over xu, yu, and zu.
# Small fixture: three identical integer columns 1..10 named xu, yu and zu.
msd_test_data <- data.frame(xu = 1:10, yu = 1:10, zu = 1:10)
#' Mean squared lagged displacement, written out column by column.
#'
#' @param data A data frame (or list) with numeric elements `xu`, `yu`, `zu`.
#' @param lag_val Lag passed to `diff()`.
#' @return A single number: the mean over positions of the summed squared
#'   lagged differences of `xu`, `yu` and `zu`.
vanilla_msd <- function(data, lag_val) {
  # Base R only, so calling this function does not attach any package.
  xu_lag_squared <- diff(data$xu, lag = lag_val)^2
  yu_lag_squared <- diff(data$yu, lag = lag_val)^2
  zu_lag_squared <- diff(data$zu, lag = lag_val)^2
  # Sum all three coordinates; the earlier version added xu three times and
  # ignored yu and zu, which only went unnoticed on identical columns.
  mean.default(xu_lag_squared + yu_lag_squared + zu_lag_squared)
}
#' Mean squared lagged displacement via `lapply()`.
#'
#' @param data A data frame; every column is differenced.
#' @param lag_val Lag passed to `diff()`.
#' @return The sum over columns of the mean squared lagged difference.
lapply_msd <- function(data, lag_val) {
  # Kept so the function still attaches magrittr when called.
  library(magrittr)
  lagged_diffs <- as.data.frame(
    lapply(data, function(column) {
      diff(column, lag = lag_val)
    })
  )
  squared_diffs <- as.data.frame(lagged_diffs^2)
  column_means <- unlist(lapply(squared_diffs, mean))
  sum(column_means)
}
#' Mean squared lagged displacement via `vapply()`.
#'
#' @param data A data frame; every column is differenced.
#' @param lag_val Lag passed to `diff()`.
#' @return The sum over columns of the mean squared lagged difference.
vapply_msd <- function(data, lag_val) {
  # Kept so the function still attaches magrittr when called.
  library(magrittr)
  # Each differenced column has nrow(data) - lag_val entries.
  diff_template <- numeric(nrow(data) - lag_val)
  lagged_diffs <- vapply(
    data,
    function(column) {
      diff(column, lag = lag_val)
    },
    FUN.VALUE = diff_template
  )
  squared_diffs <- as.data.frame(lagged_diffs^2)
  column_means <- vapply(squared_diffs, mean, FUN.VALUE = numeric(1))
  sum(column_means)
}
#' Mean squared lagged displacement via `apply()` over columns.
#'
#' @param data A matrix or data frame; every column is differenced.
#' @param lag_val Lag passed to `diff()`.
#' @return The sum over columns of the mean squared lagged difference.
apply_msd <- function(data, lag_val) {
  # Kept so the function still attaches magrittr when called.
  library(magrittr)
  lagged_diffs <- apply(data, 2, function(column) {
    diff(column, lag = lag_val)
  })
  squared_diffs <- as.data.frame(lagged_diffs^2)
  column_means <- unlist(apply(squared_diffs, 2, mean))
  sum(column_means)
}
#' Mean squared lagged displacement via purrr's `map_df()` / `map_dbl()`.
#'
#' @param data A data frame; every column is differenced.
#' @param lag_val Lag passed to `diff()`.
#' @return The sum over columns of the mean squared lagged difference.
map_msd <- function(data, lag_val) {
  # Both packages are attached on every call, exactly as before.
  library(purrr)
  library(magrittr)
  lagged_diffs <- map_df(data, function(column) {
    diff(column, lag = lag_val)
  })
  squared_diffs <- as.data.frame(lagged_diffs^2)
  sum(map_dbl(squared_diffs, mean))
}
#' Mean squared lagged displacement via a sequential `foreach` loop.
#'
#' @param data A matrix or data frame; only columns 1 to 3 are read.
#' @param lag_val Lag passed to `diff()`.
#' @return The sum of the mean squared lagged differences of the first three
#'   columns.
foreach_msd <- function(data, lag_val) {
  # Both packages are attached on every call, exactly as before.
  library(foreach)
  library(magrittr)
  column_msd <- foreach(i = 1:3, .combine = c) %do% {
    mean(diff(data[, i], lag = lag_val)^2)
  }
  sum(column_msd)
}
#' Mean squared lagged displacement via an explicit `for` loop.
#'
#' @param data A matrix or data frame; only columns 1 to 3 are read.
#' @param lag_val Lag passed to `diff()`.
#' @return The sum of the mean squared lagged differences of the first three
#'   columns.
for_msd <- function(data, lag_val) {
  # Base R only, so calling this function does not attach any package.
  # Preallocate three numeric slots; the previous `c(NA, length = 3)` built a
  # two-element vector with an element named "length" instead.
  xu_yu_zu_msd <- numeric(3)
  for (i in seq_len(3)) {
    xu_yu_zu_msd[i] <- mean(diff(data[, i], lag = lag_val)^2)
  }
  sum(xu_yu_zu_msd)
}
#' Mean squared lagged displacement via `matrixStats::colDiffs()`.
#'
#' @param data A data frame or matrix; it is converted with `as.matrix()`.
#' @param lag_val Lag passed to `colDiffs()`.
#' @return The sum over columns of the mean squared lagged difference.
coldiff_msd <- function(data, lag_val) {
  library(matrixStats)
  # Written without `%>%`: this function never attached magrittr itself, so the
  # piped version only worked when the caller had already loaded it.
  squared_diffs <- colDiffs(as.matrix(data), lag = lag_val)^2
  sum(colMeans(squared_diffs))
}
# Confirm that all implementations agree on the small fixture (recorded
# output follows each call).
vanilla_msd(msd_test_data, lag_val = 2)
## [1] 12
lapply_msd(msd_test_data, lag_val = 2)
## [1] 12
vapply_msd(msd_test_data, lag_val = 2)
## [1] 12
apply_msd(msd_test_data, lag_val = 2)
## [1] 12
map_msd(msd_test_data, lag_val = 2)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:magrittr':
##
## set_names
## The following objects are masked from 'package:foreach':
##
## accumulate, when
## [1] 12
foreach_msd(msd_test_data, lag_val = 2)
## [1] 12
for_msd(msd_test_data, lag_val = 2)
## [1] 12
coldiff_msd(msd_test_data, lag_val = 2)
## [1] 12
library(ggplot2)
library(microbenchmark)
# Benchmark every MSD implementation at several input sizes. The runs differ
# only in the number of rows, so loop over the sizes instead of repeating the
# whole block for each one.
for (n_rows in c(1e6, 1e5, 1e4, 1e2)) {
  msd_test_data <- data.frame(
    xu = seq_len(n_rows),
    yu = seq_len(n_rows),
    zu = seq_len(n_rows)
  )
  msd_timings <- microbenchmark(
    vanilla_msd(msd_test_data, lag_val = 2),
    lapply_msd(msd_test_data, lag_val = 2),
    vapply_msd(msd_test_data, lag_val = 2),
    apply_msd(msd_test_data, lag_val = 2),
    map_msd(msd_test_data, lag_val = 2),
    foreach_msd(msd_test_data, lag_val = 2),
    for_msd(msd_test_data, lag_val = 2),
    coldiff_msd(msd_test_data, lag_val = 2)
  )
  # Values are not auto-printed inside a loop, so print each plot explicitly.
  print(autoplot(msd_timings))
}
TL;DR
The performance of different algorithms changes with data dimension and data size.
If there is an algorithm dedicated to a specific task, it is most likely the fastest method.
Simpler code is most likely to be faster in R or Python.
Compare colMeans with rowMeans on the transposed matrix
# Time colMeans() on a 10-column matrix of 1e6 standard-normal draws against
# rowMeans() on its transpose, then plot the timings.
library(magrittr)
library(microbenchmark)
row_and_col_mean_data <- matrix(rnorm(1e6), ncol = 10)
row_and_col_mean_data_t <- t(row_and_col_mean_data)
ggplot2::autoplot(
  microbenchmark(
    colMeans(row_and_col_mean_data),
    rowMeans(row_and_col_mean_data_t)
  )
)
It is very clear that colMeans is faster than rowMeans.