Testing some codes to see the performance for optimization.

setwd("~/Dropbox/lammps/PMMA_big/atom600")
source("https://raw.githubusercontent.com/edwardcooper/mlmodel_select/master/timeRecord_functions.R")

MSD=function(data){
  (data$xu-data$xu[1])^2+ (data$yu-data$yu[1])^2+ (data$zu-data$zu[1])^2
}
library(foreach)
library(magrittr)
library(microbenchmark)






timeRecordB()
## [1] "Fun time log has been created"
### Load the data
atom.600.1=read.table(file="atom.600_1.txt",header=FALSE, sep=" ", stringsAsFactors = FALSE, fill=TRUE
                      ,col.names = c("atom-id","type","mol","xu","yu","zu")
                      ,colClasses = c("numeric","numeric","numeric","numeric","numeric","numeric")
                      ,na.strings = c("ITEM:","TIMESTEP","NUMBER","OF","ATOMS","BOX","BOUNDS", "pp","id","type","mol","xu","yu","zu")
)
timeRecordB(output_message = "Load in data")
# Clear out all rows with NAs.
atom.600.1=atom.600.1[complete.cases(atom.600.1),]
gc()
##              used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells     568111   30.4     940480    50.3     940480    50.3
## Vcells 1253425720 9562.9 2938853014 22421.7 2891568111 22061.0

It takes about 9.17 mins to load data using read.table.

try to use fread instead of read.table to read file into R.

setwd("~/Dropbox/lammps/PMMA_big/atom600")
timeRecordB()
library(data.table)
### Load the data with fread
atom.600.1_fread=fread(input="atom.600_1.txt", sep=" ", stringsAsFactors = FALSE, fill=TRUE
                      #,col.names = c("atom-id","type","mol","xu","yu","zu")
                      ,colClasses = c("numeric","numeric","numeric","numeric","numeric","numeric","character","character")
                      ,na.strings = c("ITEM:","TIMESTEP","NUMBER","OF","ATOMS","BOX","BOUNDS", "pp","id","type","mol","xu","yu","zu")
                      
)
## 
Read 0.0% of 192723537 rows
Read 0.7% of 192723537 rows
Read 1.3% of 192723537 rows
Read 2.0% of 192723537 rows
Read 2.7% of 192723537 rows
Read 3.3% of 192723537 rows
Read 4.0% of 192723537 rows
Read 4.7% of 192723537 rows
Read 5.3% of 192723537 rows
Read 6.0% of 192723537 rows
Read 6.7% of 192723537 rows
Read 7.4% of 192723537 rows
Read 8.0% of 192723537 rows
Read 8.7% of 192723537 rows
Read 9.4% of 192723537 rows
Read 10.0% of 192723537 rows
Read 10.7% of 192723537 rows
Read 11.4% of 192723537 rows
Read 12.0% of 192723537 rows
Read 12.7% of 192723537 rows
Read 13.4% of 192723537 rows
Read 14.0% of 192723537 rows
Read 14.7% of 192723537 rows
Read 15.4% of 192723537 rows
Read 16.0% of 192723537 rows
Read 16.7% of 192723537 rows
Read 17.4% of 192723537 rows
Read 18.0% of 192723537 rows
Read 18.7% of 192723537 rows
Read 19.4% of 192723537 rows
Read 20.1% of 192723537 rows
Read 20.7% of 192723537 rows
Read 21.4% of 192723537 rows
Read 22.0% of 192723537 rows
Read 22.7% of 192723537 rows
Read 23.4% of 192723537 rows
Read 24.1% of 192723537 rows
Read 24.7% of 192723537 rows
Read 25.4% of 192723537 rows
Read 26.1% of 192723537 rows
Read 26.7% of 192723537 rows
Read 27.4% of 192723537 rows
Read 28.1% of 192723537 rows
Read 28.7% of 192723537 rows
Read 29.4% of 192723537 rows
Read 30.1% of 192723537 rows
Read 30.7% of 192723537 rows
Read 31.4% of 192723537 rows
Read 32.1% of 192723537 rows
Read 32.7% of 192723537 rows
Read 33.4% of 192723537 rows
Read 34.1% of 192723537 rows
Read 34.8% of 192723537 rows
Read 35.4% of 192723537 rows
Read 36.1% of 192723537 rows
Read 36.8% of 192723537 rows
Read 37.4% of 192723537 rows
Read 38.1% of 192723537 rows
Read 38.8% of 192723537 rows
Read 39.4% of 192723537 rows
Read 40.1% of 192723537 rows
Read 40.8% of 192723537 rows
Read 41.4% of 192723537 rows
Read 42.1% of 192723537 rows
Read 42.8% of 192723537 rows
Read 43.4% of 192723537 rows
Read 44.1% of 192723537 rows
Read 44.8% of 192723537 rows
Read 45.4% of 192723537 rows
Read 46.1% of 192723537 rows
Read 46.8% of 192723537 rows
Read 47.5% of 192723537 rows
Read 48.1% of 192723537 rows
Read 48.8% of 192723537 rows
Read 49.5% of 192723537 rows
Read 50.1% of 192723537 rows
Read 50.8% of 192723537 rows
Read 51.5% of 192723537 rows
Read 52.2% of 192723537 rows
Read 52.8% of 192723537 rows
Read 53.4% of 192723537 rows
Read 54.1% of 192723537 rows
Read 54.8% of 192723537 rows
Read 55.4% of 192723537 rows
Read 56.0% of 192723537 rows
Read 56.6% of 192723537 rows
Read 57.1% of 192723537 rows
Read 57.7% of 192723537 rows
Read 58.4% of 192723537 rows
Read 59.1% of 192723537 rows
Read 59.7% of 192723537 rows
Read 60.4% of 192723537 rows
Read 61.1% of 192723537 rows
Read 61.7% of 192723537 rows
Read 62.4% of 192723537 rows
Read 63.1% of 192723537 rows
Read 63.7% of 192723537 rows
Read 64.4% of 192723537 rows
Read 65.1% of 192723537 rows
Read 65.7% of 192723537 rows
Read 66.4% of 192723537 rows
Read 67.1% of 192723537 rows
Read 67.7% of 192723537 rows
Read 68.4% of 192723537 rows
Read 69.1% of 192723537 rows
Read 69.7% of 192723537 rows
Read 70.4% of 192723537 rows
Read 71.1% of 192723537 rows
Read 71.8% of 192723537 rows
Read 72.4% of 192723537 rows
Read 73.1% of 192723537 rows
Read 73.8% of 192723537 rows
Read 74.4% of 192723537 rows
Read 75.1% of 192723537 rows
Read 75.8% of 192723537 rows
Read 76.4% of 192723537 rows
Read 77.1% of 192723537 rows
Read 77.8% of 192723537 rows
Read 78.5% of 192723537 rows
Read 79.1% of 192723537 rows
Read 79.8% of 192723537 rows
Read 80.5% of 192723537 rows
Read 81.1% of 192723537 rows
Read 81.8% of 192723537 rows
Read 82.5% of 192723537 rows
Read 83.2% of 192723537 rows
Read 83.8% of 192723537 rows
Read 84.5% of 192723537 rows
Read 85.2% of 192723537 rows
Read 85.9% of 192723537 rows
Read 86.5% of 192723537 rows
Read 87.2% of 192723537 rows
Read 87.9% of 192723537 rows
Read 88.5% of 192723537 rows
Read 89.2% of 192723537 rows
Read 89.9% of 192723537 rows
Read 90.6% of 192723537 rows
Read 91.2% of 192723537 rows
Read 91.9% of 192723537 rows
Read 92.6% of 192723537 rows
Read 93.3% of 192723537 rows
Read 93.9% of 192723537 rows
Read 94.6% of 192723537 rows
Read 95.3% of 192723537 rows
Read 95.9% of 192723537 rows
Read 96.6% of 192723537 rows
Read 97.3% of 192723537 rows
Read 98.0% of 192723537 rows
Read 98.6% of 192723537 rows
Read 99.3% of 192723537 rows
Read 100.0% of 192723537 rows
Read 192723537 rows and 8 (of 8) columns from 6.350 GB file in 00:02:37
timeRecordB(output_message = "Load in data fread")
# select the non-NA columns
atom.600.1_fread=atom.600.1_fread[,.(V1,V2,V3,V4,V5,V6)]
# clear the rows that have NA values, V6 contains the most NAs. 
atom.600.1_fread=atom.600.1_fread[complete.cases(atom.600.1_fread[,V6])]
colnames(atom.600.1_fread)=c("atom.id","type","mol","xu","yu","zu")
head(atom.600.1_fread,10)
##     atom.id type mol      xu       yu       zu
##  1:   25279    1  41 6.17609 7.248790 80.88420
##  2:   25257    5  41 5.40093 4.374650 83.24070
##  3:   25258    8  41 5.81708 3.137150  8.04060
##  4:   25265    9  41 5.26290 2.361650  7.71487
##  5:   25262    1  41 8.53933 4.642980 83.60970
##  6:   25259    2  41 8.17998 4.895660 82.56890
##  7:   25260    1  41 8.95838 5.010800 81.94050
##  8:   10181    1  16 7.31981 0.188811  9.74031
##  9:   25256    4  41 5.91708 5.588160  7.58943
## 10:   25255    3  41 7.03558 6.049600 82.56140

It takes 2 mins 45 seconds to load data with fread.

setwd("~/Dropbox/lammps/PMMA_big/atom600")
timeRecordR(unit="min")%>%select(output_message,run_time)%>%filter(output_message!="None")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##       output_message run_time
## 1       Load in data 7.828783
## 2 Load in data fread 2.654200

The clear winner is using fread instead of read.table. About 3 times faster.

library(microbenchmark)

microbenchmark(dplyr=atom.600.1%>%filter(atom.id==1)%>%select(xu,yu,zu)%>%MSD
               ,base_use=atom.600.1[atom.600.1[,"atom.id"]==1,]%>%.[,c("xu","yu","zu")]%>%MSD
               ,mix_use=atom.600.1[atom.600.1[,"atom.id"]==1,]%>%select(xu,yu,zu)%>%MSD
               )
## Unit: milliseconds
##      expr       min        lq      mean    median        uq      max neval
##     dplyr  800.5524  816.9284  845.4323  822.1936  836.2008 1069.277   100
##  base_use 1311.8759 1378.0593 1435.4482 1391.1968 1445.6690 1849.614   100
##   mix_use 1367.3806 1384.4002 1422.2057 1392.8801 1422.0988 1885.130   100
##  cld
##   a 
##    b
##    b

Use the first expression from dplyr pacakge, which is the fastest.

atom.600.1[,"atom.id"]%>%max()
## [1] 38528
atom.600.1[,"atom.id"]%>%min()
## [1] 1
atom.600.1[,"mol"]%>%max()
## [1] 63
atom.600.1[,"mol"]%>%min()
## [1] 0

This is faster than select.

microbenchmark(
  
  MSD.all.matrix.600=foreach(n=1:2,.combine=rbind)%do%{ 
    atom.600.1%>%filter(atom.id==n)%>%select(xu,yu,zu)%>%MSD
  } 
  ,
  MSD.all.matrix.600.2=foreach(n=1:2,.combine=rbind)%do%{ 
    atom.600.1%>%filter(atom.id==n)%>%select(xu,yu,zu)
  }
  ,times = 10
)
## Unit: seconds
##                  expr      min       lq     mean   median       uq
##    MSD.all.matrix.600 1.493174 1.502222 1.525782 1.516036 1.555009
##  MSD.all.matrix.600.2 1.486548 1.503439 1.588392 1.525168 1.642339
##       max neval cld
##  1.571679    10   a
##  1.893911    10   a

These two expressions use about the same amount of time.

The performance bottleneck is not calculating the MSD but searching for all atom.id==1 and select xu,yu,zu values to calculate MSD

Try to use data.table instead of dplyr

library(data.table)
atom.600.1.data_table=atom.600.1%>%as.data.table
timeRecordB()
microbenchmark(
  
  MSD.all.matrix.600=foreach(n=1:10,.combine=rbind)%do%{ 
    atom.600.1%>%filter(atom.id==n)%>%select(xu,yu,zu)%>%MSD
  } 
  ,
  MSD.all.matrix.600.2=foreach(n=1:10,.combine=rbind)%do%{ 
    atom.600.1%>%filter(atom.id==n)%>%select(xu,yu,zu)
  }
  ,
  MSD.all.matrix.600.3=foreach(n=1:10,.combine=rbind)%do%{ 
    atom.600.1.data_table%>%.[atom.id==n]%>%.[,.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.4=foreach(n=1:10,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
  }
  ,times = 10
)
## Unit: milliseconds
##                  expr        min         lq      mean     median
##    MSD.all.matrix.600 7324.89997 7342.07416 7356.3900 7354.87272
##  MSD.all.matrix.600.2 7346.53965 7353.92788 7364.6156 7362.91897
##  MSD.all.matrix.600.3   19.21991   20.11215  797.3469   24.09992
##  MSD.all.matrix.600.4   14.94782   15.50254  739.6977   17.84564
##          uq      max neval cld
##  7363.61205 7390.209    10   b
##  7373.19359 7400.873    10   b
##    25.75033 7759.992    10  a 
##    27.29748 7221.522    10  a
timeRecordB(output_message = "benchmark data.table searching")

Using data.table gives a huge speed-up but the exact speed-up is hard to say. Becasue the maximum and minimum fluctuates a lot.

timeRecordB()
microbenchmark(
  MSD.all.matrix.600.3=foreach(n=1:10,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n]%>%.[,.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.4=foreach(n=1:10,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
  }
  ,times = 200
)
## Unit: milliseconds
##                  expr      min       lq     mean   median       uq
##  MSD.all.matrix.600.3 22.26689 22.53669 24.82865 23.45585 26.28687
##  MSD.all.matrix.600.4 17.68947 17.88242 19.38793 18.31967 20.83256
##        max neval cld
##  101.92413   200   b
##   26.17377   200  a

Compare using onestep and two step calculation. The result is clear that the one step search is faster than a two step search.

Later calculation on the entire dataset confirms the faster in speed in about 5 seconds in the total calculation time of around 300 seconds.

microbenchmark(
   MSD.all.matrix.600.1=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1.data_table%>%.[atom.id==n]%>%.[,.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.2=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1.data_table%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.3=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n]%>%.[,.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.4=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
  }
  ,times = 200
)
## Unit: milliseconds
##                  expr      min       lq     mean   median       uq
##  MSD.all.matrix.600.1 41.33772 42.49663 46.09869 45.61239 47.70668
##  MSD.all.matrix.600.2 32.64830 33.36768 36.48657 35.33815 37.61424
##  MSD.all.matrix.600.3 41.28018 42.49388 45.61827 45.66416 46.73383
##  MSD.all.matrix.600.4 32.75910 33.46851 36.07190 35.98610 37.50148
##        max neval cld
##  130.22782   200   b
##  115.78450   200  a 
##   55.76309   200   b
##   43.58912   200  a

Compare using different dataset and different search methods.

Results the data from fread is about the same as data converted from read.table.

One step search is definitely to be faster than two step search.

Try to use setkey to see if there is an speed-up

First set the key for the entire dataset.

atom.600.1_fread_withkey=atom.600.1_fread
setkey(atom.600.1_fread_withkey,atom.id)

Then compare the one that with key and one without the key.

microbenchmark(
   
  MSD.all.matrix.600.nokey=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.withkey=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
  }
  ,times = 200
)
## Unit: milliseconds
##                        expr      min       lq     mean   median       uq
##    MSD.all.matrix.600.nokey 23.65122 25.81662 28.61165 28.27606 31.13261
##  MSD.all.matrix.600.withkey 22.83978 24.79721 27.32185 27.14373 29.20825
##       max neval cld
##  41.46325   200   b
##  38.57697   200  a

Using key give a slightly worse performance but I would say they are statistically about the same. No data so that no t.test could be done.

Let us look at the MSD function to see if we could improve its speed.

Define the MSD in a data.table way.

MSD2=function(data){
    MSD=(data[,xu]-data[1,xu])^2+(data[,yu]-data[1,yu])^2+(data[,zu]-data[1,zu])^2
    }
microbenchmark(
   
  MSD.all.matrix.600.nokey_MSD1=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.withkey_MSD1=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.nokey_MSD2=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD2
  }
  ,
  MSD.all.matrix.600.withkey_MSD2=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD2
  }
  ,times = 200
)
## Unit: milliseconds
##                             expr      min       lq     mean   median
##    MSD.all.matrix.600.nokey_MSD1 23.65118 26.66722 28.99298 28.60272
##  MSD.all.matrix.600.withkey_MSD1 22.88366 25.56328 28.11521 27.60572
##    MSD.all.matrix.600.nokey_MSD2 45.66111 50.58503 54.99623 54.02746
##  MSD.all.matrix.600.withkey_MSD2 44.55914 50.16693 55.50511 54.96421
##        uq       max neval cld
##  31.76064  40.48159   200  a 
##  30.34622  41.44011   200  a 
##  59.71503  71.06647   200   b
##  58.35046 138.33240   200   b

The original MSD function gives better results than the new MSD2 function. About double the speed. Plus, using key gives slightly better performance. very small improvement.

Next, we need to compare if the calculation with MSD and without MSD in order to see if MSD calculation takes a long time.

microbenchmark(
   
  MSD.all.matrix.600.nokey_MSD1=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.nokey_noMSD=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]
  }
  ,
  MSD.all.matrix.600.withkey_MSD1=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
  }
  ,
   MSD.all.matrix.600.withkey_noMSD=foreach(n=1:20,.combine=rbind)%do%{ 
    atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]
  }
  ,times = 200
)
## Unit: milliseconds
##                              expr      min       lq     mean   median
##     MSD.all.matrix.600.nokey_MSD1 23.75559 26.71898 29.28920 28.88816
##    MSD.all.matrix.600.nokey_noMSD 22.01079 24.77466 27.11576 26.69261
##   MSD.all.matrix.600.withkey_MSD1 22.91869 26.10989 28.10347 27.72237
##  MSD.all.matrix.600.withkey_noMSD 21.15140 23.28152 26.41954 25.36324
##        uq       max neval cld
##  31.97891  41.54170   200   c
##  29.73129  38.54494   200 ab 
##  30.06351  40.62538   200  b 
##  28.68609 106.24783   200 a

About a 3 millisecond more for calculation with MSD. Possibly could improve with C code. But overall effect would be small? Plus, using key gives a marginally quicker calculation.

Next, let us see if the foreach function takes a long time

Need to know which MSD and data do you use.

Use the original MSD function, and the data from fread with and without key.

microbenchmark(
  MSD.all.matrix.600.nokey_MSD1_noforeach=atom.600.1_fread%>%.[atom.id==100,.(xu,yu,zu)]%>%MSD
  ,
  MSD.all.matrix.600.withkey_MSD1_noforeach=atom.600.1_fread_withkey%>%.[.(100),.(xu,yu,zu)]%>%MSD
  ,
   MSD.all.matrix.600.nokey_MSD1=foreach(n=1:100,.combine=rbind)%do%{ 
    atom.600.1_fread%>%.[atom.id==n,.(xu,yu,zu)]%>%MSD
  }
  ,
  MSD.all.matrix.600.withkey_MSD1=foreach(n=1:100,.combine=rbind)%do%{ 
    atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
  }
  ,times = 200
)
## Unit: microseconds
##                                       expr        min          lq
##    MSD.all.matrix.600.nokey_MSD1_noforeach    857.676    945.3465
##  MSD.all.matrix.600.withkey_MSD1_noforeach    815.355    922.3335
##              MSD.all.matrix.600.nokey_MSD1 116339.329 120875.9115
##            MSD.all.matrix.600.withkey_MSD1 112294.480 116770.6200
##        mean     median         uq        max neval cld
##    1100.740   1061.912   1188.658   3416.612   200 a  
##    1083.747   1015.973   1121.590   3532.806   200 a  
##  132536.803 132278.905 141223.197 218724.809   200   c
##  128682.409 128524.393 136779.942 212534.229   200  b

There is some room for improvement here for using foreach. Using foreach gives a much slower calculation. Like a lot slower.

check if the various data.table methods give the same answer as the the dplyr

# two step search
(  (atom.600.1.data_table%>%.[atom.id==2000]%>%.[,.(xu,yu,zu)]%>%MSD)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search
(  (atom.600.1_fread%>%.[atom.id==2000,.(xu,yu,zu)]%>%MSD)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search with key
(  (atom.600.1_fread_withkey%>%.[atom.id==2000,.(xu,yu,zu)]%>%MSD)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search with key and new MSD function
(  (atom.600.1_fread_withkey%>%.[atom.id==2000,.(xu,yu,zu)]%>%MSD2)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search with key
(  (atom.600.1_fread_withkey%>%.[.(2000),.(xu,yu,zu)]%>%MSD)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001
# one step search with key and new MSD function
(  (atom.600.1_fread_withkey%>%.[.(2000),.(xu,yu,zu)]%>%MSD2)==(atom.600.1%>%filter(atom.id==2000)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001

It seems that using purely the key is faster but gives different result.

Find out the difference between using key and not using key

((atom.600.1_fread_withkey%>%.[atom.id==1000,.(xu,yu,zu)]%>%MSD2)-(atom.600.1_fread_withkey%>%.[.(1000),.(xu,yu,zu)]%>%MSD) )%>%plot()

The difference is kind of big, but why? Read the cheat sheet for data.table. But leave it for now, since this is not a serious bottleneck right now.

setkey(atom.600.1_fread_withkey,atom.id)

(atom.600.1_fread_withkey%>%.[atom.id==1000,.(xu,yu,zu)])%>%dim()
## [1] 5001    3
(atom.600.1_fread_withkey[.(1000),.(xu,yu,zu)])%>%dim()
## [1] 5001    3

Using key to search for the value is not doing as it is stated in the cheat-sheet from datacamp for data.table package. It only returned one value.

Read more about it later. I figured out how to work with the key. If the key column is integer or numeric, you will need to do .(10) to select key rows that have a value of 10.

After much test, the possible improvement could be done with the foreach, and possible improvement with MSD written in C.

Let us try to optimize on how to combine the results and compare it with foreach.

## Need to define MSD function and load data.table library before using this function. Plus the the key for 
MSD_matrix=function(data,timestep,tot_atoms){
  MSD_empty_matrix=matrix(NA,nrow=tot_atoms,ncol=timestep)
  for(i in 1:tot_atoms){
    MSD_empty_matrix[i,]=data%>%.[.(i),.(xu,yu,zu)]%>%MSD
  }
  return(MSD_empty_matrix)
}
microbenchmark(
  
  MSD.all.matrix.600.withkey_MSD1_for=MSD_matrix(data=atom.600.1_fread_withkey,timestep = 5001,tot_atoms = 10)
  ,
  
   MSD.all.matrix.600.withkey_MSD1_foreach=foreach(n=1:10,.combine=rbind)%do%{ 
    atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
   }
  ,times = 100
  
)
## Unit: milliseconds
##                                     expr       min        lq     mean
##      MSD.all.matrix.600.withkey_MSD1_for  8.606494  8.928119 10.55628
##  MSD.all.matrix.600.withkey_MSD1_foreach 12.618437 12.988087 15.34294
##    median       uq      max neval cld
##  10.23371 11.42610 16.81531   100  a 
##  15.18762 17.05637 21.36556   100   b

Using for within a function with replacement is the fastest method so far. The improvement is not dramatic

check if they gives the same results.

(  (MSD_matrix(data=atom.600.1_fread_withkey,timestep = 5001,tot_atoms = 1))==(atom.600.1%>%filter(atom.id==1)%>%select(xu,yu,zu)%>%MSD) )%>%sum()
## [1] 5001

check if they gives the same results.

(
  MSD_matrix(data=atom.600.1_fread_withkey,timestep = 5001,tot_atoms = 10)[5,]==(foreach(n=1:100,.combine=rbind)%do%{ 
    atom.600.1_fread_withkey%>%.[.(n),.(xu,yu,zu)]%>%MSD
   })[5,]
 )%>%sum
## [1] 5001