setwd("~/Dropbox/lammps/PMMA_big/atom600")
source("https://raw.githubusercontent.com/edwardcooper/mlmodel_select/master/timeRecord_functions.R")
# Mean squared displacement of a single-atom trajectory, measured from its first recorded position.
MSD=function(data){
(data$xu-data$xu[1])^2+ (data$yu-data$yu[1])^2+ (data$zu-data$zu[1])^2
}
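A quick sanity check on a tiny hand-made trajectory (hypothetical values, not part of the original analysis):
# toy trajectory: the atom moves 1, then 2 units along x
toy = data.frame(xu=c(0,1,2), yu=c(0,0,0), zu=c(0,0,0))
MSD(toy)   # expected: 0 1 4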
library(foreach)
library(magrittr)
library(microbenchmark)
timeRecordB()
## [1] "Fun time log has been created"
### Load the data
atom.600.1=read.table(file="atom.600_1.txt",header=FALSE, sep=" ", stringsAsFactors = FALSE, fill=TRUE
,col.names = c("atom-id","type","mol","xu","yu","zu")
,colClasses = c("numeric","numeric","numeric","numeric","numeric","numeric")
,na.strings = c("ITEM:","TIMESTEP","NUMBER","OF","ATOMS","BOX","BOUNDS", "pp","id","type","mol","xu","yu","zu")
)
timeRecordB(output_message = "Load in data")
# Clear out all rows with NAs.
atom.600.1=atom.600.1[complete.cases(atom.600.1),]
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 568111 30.4 940480 50.3 940480 50.3
## Vcells 1253425720 9562.9 2938853014 22421.7 2891568111 22061.0
It takes about 9.17 mins to load data using read.table.
setwd("~/Dropbox/lammps/PMMA_big/atom600")
timeRecordB()
library(data.table)
### Load the data with fread
atom.600.1_fread=fread(input="atom.600_1.txt", sep=" ", stringsAsFactors = FALSE, fill=TRUE
#,col.names = c("atom-id","type","mol","xu","yu","zu")
,colClasses = c("numeric","numeric","numeric","numeric","numeric","numeric","character","character")
,na.strings = c("ITEM:","TIMESTEP","NUMBER","OF","ATOMS","BOX","BOUNDS", "pp","id","type","mol","xu","yu","zu")
)
Read 192723537 rows and 8 (of 8) columns from 6.350 GB file in 00:02:37
timeRecordB(output_message = "Load in data fread")
# select the non-NA columns
atom.600.1_fread=atom.600.1_fread[,.(V1,V2,V3,V4,V5,V6)]
# drop rows with NA values; V6 contains the most NAs, so completeness is checked on it
atom.600.1_fread=atom.600.1_fread[complete.cases(atom.600.1_fread[,V6])]
colnames(atom.600.1_fread)=c("atom.id","type","mol","xu","yu","zu")
head(atom.600.1_fread,10)
## atom.id type mol xu yu zu
## 1: 25279 1 41 6.17609 7.248790 80.88420
## 2: 25257 5 41 5.40093 4.374650 83.24070
## 3: 25258 8 41 5.81708 3.137150 8.04060
## 4: 25265 9 41 5.26290 2.361650 7.71487
## 5: 25262 1 41 8.53933 4.642980 83.60970
## 6: 25259 2 41 8.17998 4.895660 82.56890
## 7: 25260 1 41 8.95838 5.010800 81.94050
## 8: 10181 1 16 7.31981 0.188811 9.74031
## 9: 25256 4 41 5.91708 5.588160 7.58943
## 10: 25255 3 41 7.03558 6.049600 82.56140
It takes 2 mins 45 seconds to load data with fread.
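A possible refinement, not timed here, is to drop the two trailing artifact columns at read time with fread's drop argument instead of selecting V1 to V6 afterwards; a sketch:
# sketch: skip the two trailing artifact columns (V7, V8) while reading
atom.600.1_fread=fread(input="atom.600_1.txt", sep=" ", fill=TRUE
,drop=7:8
,na.strings = c("ITEM:","TIMESTEP","NUMBER","OF","ATOMS","BOX","BOUNDS", "pp","id","type","mol","xu","yu","zu")
)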
setwd("~/Dropbox/lammps/PMMA_big/atom600")
timeRecordR(unit="min")%>%select(output_message,run_time)%>%filter(output_message!="None")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## output_message run_time
## 1 Load in data 7.828783
## 2 Load in data fread 2.654200
The clear winner is fread, which loads the data about three times faster than read.table.
library(microbenchmark)
microbenchmark(dplyr=atom.600.1%>%filter(atom.id==1)%>%select(xu,yu,zu)%>%MSD
,base_use=atom.600.1[atom.600.1[,"atom.id"]==1,]%>%.[,c("xu","yu","zu")]%>%MSD
,mix_use=atom.600.1[atom.600.1[,"atom.id"]==1,]%>%select(xu,yu,zu)%>%MSD
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## dplyr 800.5524 816.9284 845.4323 822.1936 836.2008 1069.277 100
## base_use 1311.8759 1378.0593 1435.4482 1391.1968 1445.6690 1849.614 100
## mix_use 1367.3806 1384.4002 1422.2057 1392.8801 1422.0988 1885.130 100
## cld
## a
## b
## b
Use the first expression, the dplyr version, which is the fastest.
atom.600.1[,"atom.id"]%>%max()
## [1] 38528
atom.600.1[,"atom.id"]%>%min()
## [1] 1
atom.600.1[,"mol"]%>%max()
## [1] 63
atom.600.1[,"mol"]%>%min()
## [1] 0
Extracting a column with base subsetting, as above, is faster than going through select.
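For reference, a quick sketch of the comparison behind that claim (not part of the original benchmarks; dplyr is assumed to be attached, as it is for the filter and select calls used elsewhere):
# sketch: base column extraction vs dplyr select for a simple max()
microbenchmark(
base_extract=atom.600.1[,"atom.id"]%>%max()
,dplyr_select=atom.600.1%>%select(atom.id)%>%max()
,times = 10
)
The next benchmark returns to the MSD pipeline and checks whether the MSD step adds noticeable cost on top of filter and select.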
microbenchmark(
MSD.all.matrix.600=foreach(n=1:2,.combine=rbind)%do%{
atom.600.1%>%filter(atom.id==n)%>%select(xu,yu,zu)%>%MSD
}
,
MSD.all.matrix.600.2=foreach(n=1:2,.combine=rbind)%do%{
atom.600.1%>%filter(atom.id==n)%>%select(xu,yu,zu)
}
,times = 10
)
## Unit: seconds
## expr min lq mean median uq
## MSD.all.matrix.600 1.493174 1.502222 1.525782 1.516036 1.555009
## MSD.all.matrix.600.2 1.486548 1.503439 1.588392 1.525168 1.642339
## max neval cld
## 1.571679 10 a
## 1.893911 10 a
These two expressions take about the same amount of time.
The performance bottleneck is therefore not the MSD arithmetic but the search itself: filtering every row with atom.id == n and selecting the xu, yu, zu columns.
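To confirm this, the MSD arithmetic can be timed on its own against a trajectory that has already been extracted; a quick sketch (one.atom is a hypothetical name, not part of the original benchmarks):
# time only the MSD arithmetic on one pre-extracted trajectory
one.atom = atom.600.1%>%filter(atom.id==1)%>%select(xu,yu,zu)
microbenchmark(MSD_only = MSD(one.atom), times = 100)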
library(data.table)
atom.600.1.data_table=atom.600.1%>%as.data.table
timeRecordB()
microbenchmark(
MSD.all.matrix.600=foreach(n=1:10,.combine=rbind)%do%{
atom.600.1%>%filter(atom.id==n)%>%select(xu,yu,zu)%>%MSD
}
,
MSD.all.matrix.600.2=foreach(n=1:10,.combine=rbind)%do%{
atom.600.1%>%filter(atom.id==n)%>%select(xu,yu,zu)
}
,
MSD.all.matrix.600.3=foreach(n=1:10,.combine=rbind)%do%{
atom.600.1.data_table%>%.[atom.id==n]%>%.[,.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.4=foreach(n=1:10,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
}
,times = 10
)
## Unit: milliseconds
## expr min lq mean median
## MSD.all.matrix.600 7324.89997 7342.07416 7356.3900 7354.87272
## MSD.all.matrix.600.2 7346.53965 7353.92788 7364.6156 7362.91897
## MSD.all.matrix.600.3 19.21991 20.11215 797.3469 24.09992
## MSD.all.matrix.600.4 14.94782 15.50254 739.6977 17.84564
## uq max neval cld
## 7363.61205 7390.209 10 b
## 7373.19359 7400.873 10 b
## 25.75033 7759.992 10 a
## 27.29748 7221.522 10 a
timeRecordB(output_message = "benchmark data.table searching")
Using data.table gives a huge speed-up, although the exact factor is hard to pin down because the maximum and minimum timings fluctuate a lot.
timeRecordB()
microbenchmark(
MSD.all.matrix.600.3=foreach(n=1:10,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n]%>%.[,.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.4=foreach(n=1:10,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
}
,times = 200
)
## Unit: milliseconds
## expr min lq mean median uq
## MSD.all.matrix.600.3 22.26689 22.53669 24.82865 23.45585 26.28687
## MSD.all.matrix.600.4 17.68947 17.88242 19.38793 18.31967 20.83256
## max neval cld
## 101.92413 200 b
## 26.17377 200 a
Compare the one-step and the two-step search. The result is clear: the one-step search is faster.
A later calculation on the entire dataset confirms the speed-up, saving roughly 5 seconds out of a total calculation time of about 300 seconds.
microbenchmark(
MSD.all.matrix.600.1=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1.data_table%>%.[atom.id==n]%>%.[,.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.2=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1.data_table%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.3=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n]%>%.[,.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.4=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
}
,times = 200
)
## Unit: milliseconds
## expr min lq mean median uq
## MSD.all.matrix.600.1 41.33772 42.49663 46.09869 45.61239 47.70668
## MSD.all.matrix.600.2 32.64830 33.36768 36.48657 35.33815 37.61424
## MSD.all.matrix.600.3 41.28018 42.49388 45.61827 45.66416 46.73383
## MSD.all.matrix.600.4 32.75910 33.46851 36.07190 35.98610 37.50148
## max neval cld
## 130.22782 200 b
## 115.78450 200 a
## 55.76309 200 b
## 43.58912 200 a
Compare the different datasets and search methods.
The data loaded with fread performs about the same as the data converted from read.table, and the one-step search is again faster than the two-step search.
Next, set the key for the entire dataset.
atom.600.1_fread_withkey=atom.600.1_fread
setkey(atom.600.1_fread_withkey,atom.id)
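A quick optional check that the key is in place, plus an aside on copying (not part of the original analysis):
# optional check that the key was set
key(atom.600.1_fread_withkey)   # should print "atom.id"
# note: atom.600.1_fread_withkey = atom.600.1_fread does not make an independent copy;
# use copy(atom.600.1_fread) if the unkeyed table must not be reordered by setkey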
Then compare the keyed and unkeyed versions.
microbenchmark(
MSD.all.matrix.600.nokey=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.withkey=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
}
,times = 200
)
## Unit: milliseconds
## expr min lq mean median uq
## MSD.all.matrix.600.nokey 23.65122 25.81662 28.61165 28.27606 31.13261
## MSD.all.matrix.600.withkey 22.83978 24.79721 27.32185 27.14373 29.20825
## max neval cld
## 41.46325 200 b
## 38.57697 200 a
In this run the keyed search is actually slightly faster, but the two are close enough to call practically the same; the raw timings were not saved, so no t-test was run here.
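If the benchmark is assigned to an object, microbenchmark does keep the raw timings, so a formal test is possible; a minimal sketch (bm_key is a hypothetical name, not run in the original analysis):
# sketch: save the benchmark object to get the raw timings, then test formally
bm_key = microbenchmark(
nokey=foreach(n=1:20,.combine=rbind)%do%{ atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD }
,withkey=foreach(n=1:20,.combine=rbind)%do%{ atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD }
,times = 200
)
t.test(time ~ expr, data = bm_key)   # Welch two-sample t-test on the nanosecond timings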
Define an MSD function written in a data.table style.
# data.table-style MSD: every column access goes through data.table's [ operator
MSD2=function(data){
(data[,xu]-data[1,xu])^2+(data[,yu]-data[1,yu])^2+(data[,zu]-data[1,zu])^2
}
microbenchmark(
MSD.all.matrix.600.nokey_MSD1=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.withkey_MSD1=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.nokey_MSD2=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD2
}
,
MSD.all.matrix.600.withkey_MSD2=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD2
}
,times = 200
)
## Unit: milliseconds
## expr min lq mean median
## MSD.all.matrix.600.nokey_MSD1 23.65118 26.66722 28.99298 28.60272
## MSD.all.matrix.600.withkey_MSD1 22.88366 25.56328 28.11521 27.60572
## MSD.all.matrix.600.nokey_MSD2 45.66111 50.58503 54.99623 54.02746
## MSD.all.matrix.600.withkey_MSD2 44.55914 50.16693 55.50511 54.96421
## uq max neval cld
## 31.76064 40.48159 200 a
## 30.34622 41.44011 200 a
## 59.71503 71.06647 200 b
## 58.35046 138.33240 200 b
The original MSD function is about twice as fast as the new MSD2 function. As before, using the key gives a slightly better, though very small, improvement.
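The gap is most likely the overhead of data.table's [ on every column access inside MSD2. A hypothetical MSD3, not benchmarked here, that extracts each column once and then works on plain vectors should behave like the original MSD on both data.frames and data.tables:
# hypothetical variant: extract each column once, then work on plain vectors
MSD3=function(data){
xu=data[["xu"]]; yu=data[["yu"]]; zu=data[["zu"]]
(xu-xu[1])^2 + (yu-yu[1])^2 + (zu-zu[1])^2
}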
Next, compare the calculation with and without the MSD step to see how much time the MSD calculation itself takes.
microbenchmark(
MSD.all.matrix.600.nokey_MSD1=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.nokey_noMSD=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]
}
,
MSD.all.matrix.600.withkey_MSD1=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.withkey_noMSD=foreach(n=1:20,.combine=rbind)%do%{
atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]
}
,times = 200
)
## Unit: milliseconds
## expr min lq mean median
## MSD.all.matrix.600.nokey_MSD1 23.75559 26.71898 29.28920 28.88816
## MSD.all.matrix.600.nokey_noMSD 22.01079 24.77466 27.11576 26.69261
## MSD.all.matrix.600.withkey_MSD1 22.91869 26.10989 28.10347 27.72237
## MSD.all.matrix.600.withkey_noMSD 21.15140 23.28152 26.41954 25.36324
## uq max neval cld
## 31.97891 41.54170 200 c
## 29.73129 38.54494 200 ab
## 30.06351 40.62538 200 b
## 28.68609 106.24783 200 a
The MSD step adds about 3 milliseconds. This could possibly be improved with C code, but the overall effect would be small. As before, using the key gives a marginally quicker calculation.
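As a sketch of the C route, the MSD arithmetic could be compiled with Rcpp; this is an illustration only, Rcpp is not used in this analysis and msd_cpp is an assumed name:
library(Rcpp)
# hypothetical compiled MSD, equivalent to the R version above
cppFunction('
NumericVector msd_cpp(NumericVector xu, NumericVector yu, NumericVector zu) {
  int n = xu.size();
  NumericVector out(n);
  for (int i = 0; i < n; i++) {
    double dx = xu[i] - xu[0];
    double dy = yu[i] - yu[0];
    double dz = zu[i] - zu[0];
    out[i] = dx*dx + dy*dy + dz*dz;
  }
  return out;
}')
# usage sketch: atom.600.1_fread_withkey[.(1), msd_cpp(xu, yu, zu)]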
For the next benchmark, be explicit about which MSD function and which data are used: the original MSD function and the fread data, with and without a key, each evaluated with and without foreach.
microbenchmark(
MSD.all.matrix.600.nokey_MSD1_noforeach=atom.600.1_fread%>%.[atom.id==100,.(xu,yu,zu)]%>%MSD
,
MSD.all.matrix.600.withkey_MSD1_noforeach=atom.600.1_fread_withkey%>%.[.(100),.(xu,yu,zu)]%>%MSD
,
MSD.all.matrix.600.nokey_MSD1=foreach(n=1:100,.combine=rbind)%do%{
atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
}
,
MSD.all.matrix.600.withkey_MSD1=foreach(n=1:100,.combine=rbind)%do%{
atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
}
,times = 200
)
## Unit: microseconds
## expr min lq
## MSD.all.matrix.600.nokey_MSD1_noforeach 857.676 945.3465
## MSD.all.matrix.600.withkey_MSD1_noforeach 815.355 922.3335
## MSD.all.matrix.600.nokey_MSD1 116339.329 120875.9115
## MSD.all.matrix.600.withkey_MSD1 112294.480 116770.6200
## mean median uq max neval cld
## 1100.740 1061.912 1188.658 3416.612 200 a
## 1083.747 1015.973 1121.590 3532.806 200 a
## 132536.803 132278.905 141223.197 218724.809 200 c
## 128682.409 128524.393 136779.942 212534.229 200 b
There is clearly room for improvement in the foreach step: wrapping the keyed lookups in foreach makes them dramatically slower than the single lookups on their own.
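One direction not benchmarked here is to drop the per-atom loop entirely and let data.table compute the MSD for every atom in a single grouped pass; a sketch, assuming the column names used above:
# sketch: grouped MSD for all atoms at once, no explicit loop
MSD.all.grouped = atom.600.1_fread_withkey[
, .(msd = (xu-xu[1])^2 + (yu-yu[1])^2 + (zu-zu[1])^2)
, by = atom.id]
The result comes back in long format, one msd value per atom per timestep, rather than the wide matrix built with rbind above.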
Before optimizing further, check that the different search methods all return the same MSD values.
# two step search
( (atom.600.1.data_table%>%.[atom.id==2000]%>%.[,.(xu,yu,zu)]%>%MSD)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search
( (atom.600.1_fread%>%.[atom.id==2000,.(xu,yu,zu)]%>%MSD)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search with key
( (atom.600.1_fread_withkey%>%.[atom.id==2000,.(xu,yu,zu)]%>%MSD)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search with key and new MSD function
( (atom.600.1_fread_withkey%>%.[atom.id==2000,.(xu,yu,zu)]%>%MSD2)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search with key
( (atom.600.1_fread_withkey%>%.[.(2000),.(xu,yu,zu)]%>%MSD)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search with key and new MSD function
( (atom.600.1_fread_withkey%>%.[.(2000),.(xu,yu,zu)]%>%MSD2)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
It seems that subsetting purely by the key is faster, but at one point it appeared to give a different result.
Plot the difference between the keyed and the unkeyed search to see how large it is:
((atom.600.1_fread_withkey%>%.[atom.id==1000,.(xu,yu,zu)]%>%MSD2)-(atom.600.1_fread_withkey%>%.[.(1000),.(xu,yu,zu)]%>%MSD) )%>%plot()
The difference is surprisingly large, but why? The data.table cheat sheet should explain it; leave it for now, since this is not a serious bottleneck.
setkey(atom.600.1_fread_withkey,atom.id)
(atom.600.1_fread_withkey%>%.[atom.id==1000,.(xu,yu,zu)])%>%dim()
## [1] 5001 3
(atom.600.1_fread_withkey[.(1000),.(xu,yu,zu)])%>%dim()
## [1] 5001 3
At first, searching by key did not behave as described in the DataCamp cheat sheet for the data.table package: it returned only a single row.
After reading more, the reason became clear: when the key column is integer or numeric, you have to write .(10) to select the rows whose key value is 10; a bare 10 is interpreted as a row number.
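A small illustration of the difference (not run in the original analysis):
atom.600.1_fread_withkey[.(10)]   # all rows whose key column (atom.id) equals 10
atom.600.1_fread_withkey[10]      # the 10th row of the table, ignoring the key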
After all this testing, the remaining room for improvement is in the foreach step and, possibly, in an MSD written in C.
Let us try optimizing how the per-atom results are combined and compare it with foreach.
## The MSD function must be defined and data.table loaded before calling this function; the key of data must also be set on atom.id.
MSD_matrix=function(data,timestep,tot_atoms){
MSD_empty_matrix=matrix(NA,nrow=tot_atoms,ncol=timestep)
for(i in 1:tot_atoms){
MSD_empty_matrix[i,]=data%>%.[.(i),.(xu,yu,zu)]%>%MSD
}
return(MSD_empty_matrix)
}
microbenchmark(
MSD.all.matrix.600.withkey_MSD1_for=MSD_matrix(data=atom.600.1_fread_withkey,timestep = 5001,tot_atoms = 10)
,
MSD.all.matrix.600.withkey_MSD1_foreach=foreach(n=1:10,.combine=rbind)%do%{
atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
}
,times = 100
)
## Unit: milliseconds
## expr min lq mean
## MSD.all.matrix.600.withkey_MSD1_for 8.606494 8.928119 10.55628
## MSD.all.matrix.600.withkey_MSD1_foreach 12.618437 12.988087 15.34294
## median uq max neval cld
## 10.23371 11.42610 16.81531 100 a
## 15.18762 17.05637 21.36556 100 b
Using a for loop inside a function that fills a preallocated matrix is the fastest method so far, although the improvement is not dramatic.
Check that the two approaches give the same results.
( (MSD_matrix(data=atom.600.1_fread_withkey,timestep = 5001,tot_atoms = 1))==(atom.600.1%>%filter(atom.id==1)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
Check again, this time comparing one row of the for-loop result against the corresponding row of the foreach result.
(
MSD_matrix(data=atom.600.1_fread_withkey,timestep = 5001,tot_atoms = 10)[5,]==(foreach(n=1:100,.combine=rbind)%do%{
atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
})[5,]
)%>%sum
## [1] 5001