In [12]:
import h5py
import numpy as np
from random import sample
from random import seed
import time
import pandas as pd
from numpy.matlib import rand,randn
%matplotlib inline
import matplotlib.pyplot as plt
import h5pyd
In [2]:
#===============================================================================
# access local H5
#===============================================================================
# Location of the local HDF5 file used as the baseline for the benchmark.
LOCAL_H5_PATH = "/loc/no-backup/mike/shared/file3322215c73fb.nc"

# The handle is deliberately left open: later cells read from `ds_local`.
f_local = h5py.File(LOCAL_H5_PATH, "r")
ds_local = f_local["/0"]
# The dataset is 2-D; name both extents so the benchmark can sample indices.
nChannel, nCells = ds_local.shape
In [3]:
def read_local(ds_local, idx1, idx2, isPointSelection = True):
    """Time a random 2-D selection read from a local H5 dataset.

    One read is issued per entry of ``idx1``; from each of those rows the
    columns listed in ``idx2`` are kept. The values are copied into a
    preallocated float32 buffer and then discarded: only the timing is
    returned.

    Parameters
    ----------
    ds_local : h5py._hl.dataset.Dataset
        2-D dataset to read from (anything indexable as ``ds[x, cols]`` and
        ``ds[x, :]`` works).
    idx1 : sequence of int
        Sorted random indices for dimension 1.
    idx2 : list of int
        Sorted random indices for dimension 2.
    isPointSelection : bool
        If True, ``idx2`` is passed to the dataset so the selection is done
        at the H5 level. If False, the whole row is read and subset in
        memory afterwards.

    Returns
    -------
    float
        Elapsed time in seconds spent in the read loop.
    """
    # Preallocate the output so allocation cost stays outside the timed region.
    vals1 = np.zeros((len(idx1), len(idx2)), dtype='float32')
    # perf_counter is an elapsed-time clock; time.clock() no longer exists in
    # Python >= 3.8 and measured CPU time on Unix.
    t1 = time.perf_counter()
    # Fix the first dimension and select on the second, since h5py does not
    # allow random selection on both dimensions at once.
    for i, x in enumerate(idx1):
        if isPointSelection:
            vals1[i, :] = ds_local[x, idx2]
        else:
            vals1[i, :] = ds_local[x, :][idx2]
    return time.perf_counter() - t1
In [4]:
#===============================================================================
# create remote h5 (one-off upload of the local dataset; kept commented out for reference)
#===============================================================================

# f_remote = h5pyd.File("/home/wjiang2/tcell", "w")
# f_remote.create_group("datasets")
# 
# d_remote = f_remote.create_dataset("datasets/0", (nChannel, nCells), "float32")
# for i in range(nChannel):
#     d_remote[i, :] = ds_local[i, :] 
# f_remote.close()
#     
In [5]:
#===============================================================================
# API to access remote h5 
#===============================================================================

def read_remote(ds_remote, idx1, idx2):
    """Time a random 2-D point selection read from a remote H5 dataset.

    Every (row, column) pair from the cross product of ``idx1`` and ``idx2``
    is requested in a single indexing call. The values read are discarded:
    only the timing is returned.

    Parameters
    ----------
    ds_remote : dataset opened through h5pyd
        2-D dataset to read from; it is indexed with a list of
        ``(row, col)`` tuples.
    idx1 : sequence of int
        Sorted random indices for dimension 1.
    idx2 : sequence of int
        Sorted random indices for dimension 2.

    Returns
    -------
    float
        Elapsed time in seconds for the single selection request. Building
        the coordinate list is not included.
    """
    # Use tupled coordinates to do the point selection directly through h5pyd.
    coord = [(i, j) for i in idx1 for j in idx2]
    # perf_counter measures elapsed time, so waiting on the remote service is
    # counted; time.clock() no longer exists in Python >= 3.8 and measured
    # CPU time on Unix.
    t = time.perf_counter()
    ds_remote[coord]  # result intentionally unused: only the read is timed
    return time.perf_counter() - t
In [6]:
# Open the remote file read-only through h5pyd and pick the dataset to benchmark.
REMOTE_H5_PATH = "/home/wjiang2/tcell"
REMOTE_DATASET = "/datasets/0"

f_remote = h5pyd.File(REMOTE_H5_PATH, "r")
ds_remote = f_remote[REMOTE_DATASET]
In [7]:
# Show the remote dataset's repr to check what was opened.
ds_remote
Out[7]:
<HDF5 dataset "0": shape (15, 76427), type "<f4">
In [8]:
#===============================================================================
# benchmark
#===============================================================================

# Seed the stdlib RNG so the sampled indices are the same on every run.
seed(1)

# Number of channels drawn per trial (held constant).
nX = 2
# Numbers of cells to draw; one trial per entry.
cells = list(range(10, 10000, 1000))

# One row per access method, one column per entry of `cells`:
#   row 0 = local read, whole row then in-memory subset
#   row 1 = local read, point selection at the H5 level
#   row 2 = remote read
res = np.zeros((3, len(cells)))
for i, nY in enumerate(cells):
    # Draw sorted random indices along each dimension.
    idx1 = sorted(sample(range(nChannel), nX))
    idx2 = sorted(sample(range(nCells), nY))
    # The tuple is evaluated left to right, so the three reads run in row order.
    res[:, i] = (
        read_local(ds_local, idx1, idx2, isPointSelection = False),
        read_local(ds_local, idx1, idx2, isPointSelection = True),
        read_remote(ds_remote, idx1, idx2),
    )
In [13]:
#===============================================================================
# plot the timing results
#===============================================================================
# Column labels must follow the row order of `res` filled by the benchmark:
# row 0 = local read without point selection, row 1 = local read with point
# selection, row 2 = remote read.
df = pd.DataFrame(res.transpose(), columns = ['h5 local(non-pt-sel)', 'h5 local', 'hsds'], index = cells)
# DataFrame.plot opens its own figure unless it is given an axes, so create
# the axes once and hand it over; this avoids leaving an empty figure behind.
fig, ax = plt.subplots()
df.plot(ax = ax, style = '.-')
ax.set_yscale("log")
ax.set_xlabel("number of cells selected")
ax.set_ylabel("time (s)");
<matplotlib.figure.Figure at 0x7faf78f8de48>