The ncdf R package

The NetCDF is a set of software libraries for machine-independent data formats. Go to Website

The ncdf R package uses the NetCDF or Network Common Data Form for storing flat files for quick access.

This tutorial is based on the “Introduction to Handling Large Data” presentation by Ken Rice and Thomas Lumley. Click here for the slides.

Note: Linux distribution users need to install the libnetcdf-dev and netcdf-bin libraries.

sudo apt-get install libnetcdf-dev netcdf-bin
sudo apt-get install netcdf-bin

Reading a netCDF database file

BiocInstaller::biocLite("ncdf")
library(ncdf)
nc <- open.ncdf("hapmap.nc")

Read all of chromosome variable

Here, -1 means all values in the vector

chromosome <- get.var.ncdf(nc, "chr", start = 1, count = -1)

Create a list for results

runs <- vector("list", nsamples)

for (i in 1:nsamples){
  ## read all genotypes for one person
  genotypes <- get.var.ncdf(nc, "geno", start = c(1, i), count = c(-1, 1))
  ## zero for heterozygous, chromosome number for homozygous
  hmzygous <- genotypes != 1
  hmzygous <- as.vector(hmzygous*chromosome)
  ## consecutive runs of some value
  r <- rle(hmzygous)
  begin <- cumsum(c(1, r$lengths))
  end <- cumsum(r$lengths)
  long <- which(r$lengths > 250 $ r$values != 0)
  runs[[i]] <- cbind(begin[long], end[long], r$lengths[long])
}

It’s always a good idea to close the connection to server (in this case, local file)

close.ncdf(nc)

Creating netCDF files

  1. Define dimensions
  2. Define variables and specify which dimensions they use
  3. Create an empty file
  4. Write data to the file
snpdim <- dim.def.ncdf("position", "bases", positions)
sampledim <- dim.def.ncdf("seqnum", "count", 1:10, unlim = TRUE)
varChrm <- var.def.ncdf("chr", "count", dim = snpdim, missval = -1, prec = "byte")
varSNP <- var.def.ncdf("SNP","rs",dim=snpdim, missval=-1, prec="integer")
vargeno <- var.def.ncdf("geno","base",dim=list(snpdim, sampledim), missval=-1, prec="byte")
vartheta <- var.def.ncdf("theta","deg",dim=list(snpdim, sampledim), missval=-1, prec="double")
varr <- var.def.ncdf("r","copies",dim=list(snpdim, sampledim), missval=-1, prec="double")

Create the .nc file

genofile <- create.ncdf("hapmap.nc", list(varChrm, varSNP, vargeno, vartheta, varr))

Note: File is empty when created. Write data to file using put.var.ncdf().

for(i in 1:4000){
temp.geno.data <- readRawData(i) ## somehow
put.var.ncdf(genofile, "geno", temp.geno.data,
start=c(1,i), count=c(-1,1))
}