ncdf R packageThe NetCDF is a set of software libraries for machine-independent data formats. Go to Website
The ncdf R package uses the NetCDF or Network Common Data Form for storing flat files for quick access.
This tutorial is based on the “Introduction to Handling Large Data” presentation by Ken Rice and Thomas Lumley. Click here for the slides.
Note: Linux distribution users need to install the libnetcdf-dev and netcdf-bin libraries.
sudo apt-get install libnetcdf-dev netcdf-bin
sudo apt-get install netcdf-bin
BiocInstaller::biocLite("ncdf")
library(ncdf)
nc <- open.ncdf("hapmap.nc")
Here, -1 means all values in the vector
chromosome <- get.var.ncdf(nc, "chr", start = 1, count = -1)
runs <- vector("list", nsamples)
for (i in 1:nsamples){
## read all genotypes for one person
genotypes <- get.var.ncdf(nc, "geno", start = c(1, i), count = c(-1, 1))
## zero for heterozygous, chromosome number for homozygous
hmzygous <- genotypes != 1
hmzygous <- as.vector(hmzygous*chromosome)
## consecutive runs of some value
r <- rle(hmzygous)
begin <- cumsum(c(1, r$lengths))
end <- cumsum(r$lengths)
long <- which(r$lengths > 250 $ r$values != 0)
runs[[i]] <- cbind(begin[long], end[long], r$lengths[long])
}
close.ncdf(nc)
snpdim <- dim.def.ncdf("position", "bases", positions)
sampledim <- dim.def.ncdf("seqnum", "count", 1:10, unlim = TRUE)
varChrm <- var.def.ncdf("chr", "count", dim = snpdim, missval = -1, prec = "byte")
varSNP <- var.def.ncdf("SNP","rs",dim=snpdim, missval=-1, prec="integer")
vargeno <- var.def.ncdf("geno","base",dim=list(snpdim, sampledim), missval=-1, prec="byte")
vartheta <- var.def.ncdf("theta","deg",dim=list(snpdim, sampledim), missval=-1, prec="double")
varr <- var.def.ncdf("r","copies",dim=list(snpdim, sampledim), missval=-1, prec="double")
genofile <- create.ncdf("hapmap.nc", list(varChrm, varSNP, vargeno, vartheta, varr))
Note: File is empty when created. Write data to file using put.var.ncdf().
for(i in 1:4000){
temp.geno.data <- readRawData(i) ## somehow
put.var.ncdf(genofile, "geno", temp.geno.data,
start=c(1,i), count=c(-1,1))
}