In [12]:
import h5py
import numpy as np
from random import sample
from random import seed
import time
import pandas as pd
from numpy.matlib import rand,randn
%matplotlib inline
import matplotlib.pyplot as plt
import h5pyd
In [15]:
#===============================================================================
# access local H5
#===============================================================================
f_local = h5py.File("/fh/fast/gottardo_r/mike_working/Malaria/gslist/BGZkZLOrEb/file393249629993.nc", "r")
ds_local = f_local["/28"]
ds_local.shape
Out[15]:
(22, 482911)
In [16]:
nChannel, nCells = ds_local.shape
In [3]:
def read_local(ds_local, idx1, idx2, isPointSelection = True):
    """random slicing local H5 (2d)
        ds_local h5py._hl.dataset.Dataset
        idx1 sorted random idx for dimension 1
        idx2 sorted random idx for dimension 2
        isPointSelection boolean whether do the point selection at H5 level
    """
    #preallocate output memory 
    vals1 = np.zeros((len(idx1), len(idx2)), dtype='float32')
    t1 = time.clock()
    #fix x dim and random slicing on y since h5py doesn't allow random selection on both 
    for i,x in enumerate(idx1):
        if(isPointSelection):
            vals1[i,:] = ds_local[x, idx2]
        else:
            vals1[i,:] = ds_local[x, :][idx2]
    t2 = time.clock() - t1
    return t2
In [4]:
#===============================================================================
# create remote h5
#===============================================================================
f_remote = h5pyd.File("/home/wjiang2/tcell", "w")
d_remote = f_remote.create_dataset("0", (nChannel, nCells), "float32", chunks = (5,120727))
for i in range(nChannel):
    d_remote[i, :] = ds_local[i, :] 
f_remote.close()
In [5]:
#===============================================================================
# API to access remote h5 
#===============================================================================

def read_remote(ds_remote, idx1, idx2):
    """random slicing local H5 (2d)
        ds_remote h5py._hl.dataset.Dataset
        idx1 sorted random idx for dimension 1
        idx2 sorted random idx for dimension 2
    """
    
    #use tupled coordinates to do point selection directly through h5pyd
    coord = [(i,j) for i in idx1 for j in idx2]
    t = time.clock()
    vals2 = ds_remote[coord]
    return time.clock() - t
In [18]:
f_remote = h5pyd.File("/home/wjiang2/tcell", "r")
ds_remote = f_remote["/0"]
In [19]:
ds_remote.shape
Out[19]:
(22, 482911)
In [20]:
ds_remote.chunks
Out[20]:
[5, 120727]
In [21]:
#===============================================================================
# benchmark
#===============================================================================

seed(1)

nX = 2 #fix the number of channels
cells = list(range(10, 10000,1000))#different size of cells

res = np.zeros((3, len(cells)))
for i,nY in enumerate(cells): 
    #generate random idx for x and y
    idx1 = sorted(sample(range(nChannel), nX))
    idx2 = sorted(sample(range(nCells), nY))
    res[0,i] = read_local(ds_local, idx1, idx2, isPointSelection = False)
    res[1,i] = read_local(ds_local, idx1, idx2, isPointSelection = True)
    res[2,i] = read_remote(ds_remote,idx1, idx2)
In [26]:
#===============================================================================
# plot the timing results
#===============================================================================
df = pd.DataFrame(res.transpose(), columns = ['h5 local(non-pt-sel)', 'h5 local' ,'hsds'], index = cells)
plt.figure()
df.plot(style = '.-')
plt.yscale("log")
plt.xlabel("nCells randomly selected")
plt.ylabel("Time (s)")
Out[26]:
<matplotlib.text.Text at 0x7faf672e4908>
<matplotlib.figure.Figure at 0x7faf66fe6160>
In [27]:
df
Out[27]:
h5 local(non-pt-sel) h5 local hsds
10 0.018716 0.006565 0.008854
1010 0.014747 0.135130 0.013849
2010 0.010962 0.549267 0.021959
3010 0.013194 1.198138 0.022833
4010 0.007392 2.112973 0.039661
5010 0.010063 3.599771 0.043981
6010 0.011559 5.511220 0.053650
7010 0.011437 7.704631 0.059908
8010 0.013882 10.001091 0.065915
9010 0.011678 12.592983 0.056122