import FASTA files

# read the FASTA record; the result is a list of class "DNAbin":
myFA <- read.FASTA("MT103168.fasta")
head(myFA) # printed summary: sequence length, label, base composition

## 1 DNA sequence in binary format stored in a list.
## 
## Sequence length: 1560 
## 
## Label:
## MT103168.1 Bifidobacterium longum strain BB536 cell division...
## 
## Base composition:
##     a     c     g     t 
## 0.156 0.319 0.289 0.236 
## (Total: 1.56 kb)

str(myFA) # show structure of FASTA: a list with one raw vector per sequence

## List of 1
##  $ MT103168.1 Bifidobacterium longum strain BB536 cell division protein FtsW (rodA) gene, complete cds: raw [1:1560] 88 18 48 88 ...
##  - attr(*, "class")= chr "DNAbin"

import FASTQ files

# read the FASTQ reads; bases are stored as "DNAbin" and the quality scores
# travel along in the "QUAL" attribute:
myFQ <- read.fastq("ERR1072710.fastq")
head(myFQ) # printed summary: sequence lengths, labels, base composition

## 3 DNA sequences in binary format stored in a list.
## 
## Mean sequence length: 183.667 
##    Shortest sequence: 146 
##     Longest sequence: 259 
## 
## Labels:
## ERR1072710.1 10317.000001315_0 length=151
## ERR1072710.2 10317.000001315_1 length=116
## ERR1072710.4 10317.000001315_3 length=151
## 
## Base composition:
##     a     c     g     t 
## 0.318 0.208 0.254 0.219 
## (Total: 551 bases)

str(myFQ) # show structure of FASTQ: raw vectors plus a "QUAL" attribute

## List of 3
##  $ ERR1072710.1 10317.000001315_0 length=151: raw [1:146] 18 18 88 88 ...
##  $ ERR1072710.2 10317.000001315_1 length=116: raw [1:259] 18 28 18 28 ...
##  $ ERR1072710.4 10317.000001315_3 length=151: raw [1:146] 28 28 88 28 ...
##  - attr(*, "class")= chr "DNAbin"
##  - attr(*, "QUAL")=List of 7
##   ..$ ERR1072710.1 10317.000001315_0 length=151: num [1:11] 32 38 51 34 32 34 32 34 32 38 ...
##   ..$ ERR1072710.2 10317.000001315_1 length=116: num [1:11] 30 30 30 30 30 30 30 30 30 30 ...
##   ..$ ERR1072710.4 10317.000001315_3 length=151: num [1:42] 10 36 49 49 16 15 22 17 22 16 ...
##   ..$ NA                                       : num [1:70] 51 32 34 38 38 32 38 38 38 51 ...
##   ..$ NA                                       : num [1:67] 30 30 30 30 30 30 30 30 30 30 ...
##   ..$ NA                                       : num [1:11] 32 51 51 32 38 32 38 34 34 51 ...
##   ..$ NA                                       : num [1:11] 30 30 30 30 30 30 30 30 30 30 ...

import and restructure VCF file

# standard table read; read.table() skips every line that starts with "#"
# (default comment.char = "#"), including the "#CHROM" header line, so the
# columns fall back to the default names V1..V10:
myVCF <- read.table("TwoVariants.vcf")
colnames(myVCF)

##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10"

head(myVCF) # preview the first rows (this file holds only two variants)

##                  V1  V2 V3 V4 V5 V6 V7                   V8             V9
## 1 NZ_BCYL01000006.1  29  .  A  G  .  .  AC=84;AF=1.0;SB=0.0 GT:AC:AF:SB:NC
## 2 NZ_BCYL01000006.1 145  .  A  G  .  . AC=114;AF=1.0;SB=0.0 GT:AC:AF:SB:NC
##                          V10
## 1  1:84:1.0:0.0:+G=37,-G=47,
## 2 1:114:1.0:0.0:+G=42,-G=72,

str(myVCF) # column types as inferred by read.table (V2, the position, is int)

## 'data.frame':    2 obs. of  10 variables:
##  $ V1 : chr  "NZ_BCYL01000006.1" "NZ_BCYL01000006.1"
##  $ V2 : int  29 145
##  $ V3 : chr  "." "."
##  $ V4 : chr  "A" "A"
##  $ V5 : chr  "G" "G"
##  $ V6 : chr  "." "."
##  $ V7 : chr  "." "."
##  $ V8 : chr  "AC=84;AF=1.0;SB=0.0" "AC=114;AF=1.0;SB=0.0"
##  $ V9 : chr  "GT:AC:AF:SB:NC" "GT:AC:AF:SB:NC"
##  $ V10: chr  "1:84:1.0:0.0:+G=37,-G=47," "1:114:1.0:0.0:+G=42,-G=72,"

# preserve headers by reading the entire file with one line per row; note that
# read.csv() consumes the first line ("##fileformat=...") as the column name,
# so the rows of myLINES start at the second line of the file:
myLINES <- read.csv("TwoVariants.vcf", sep = "\n")
str(myLINES)

## 'data.frame':    14 obs. of  1 variable:
##  $ X..fileformat.VCFv4.3: chr  "##fileDate=20220331" "##source=Naive Variant Caller version 0.0.4" "##reference=file:///corral4/main/objects/9/1/1/dataset_91189d3d-e03a-47db-b8da-37401d29104e.dat" "##INFO=<ID=AC,Number=A,Type=Integer,Description=Allele count in genotypes, for each ALT allele, in the same order as listed>" ...

extract column names from 12th line of myLINES

# the column headers live on the single line that starts with "#CHROM"
# (row 12 of myLINES for this file); locate it by pattern rather than by a
# fixed row number so files with more or fewer meta lines still work:
headerRow <- grep("^#CHROM", myLINES[, 1])
# fail loudly if the header line is missing or ambiguous:
stopifnot(length(headerRow) == 1L)
headerLine <- myLINES[headerRow, 1]
print(headerLine)

## [1] "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t__NONE__"

# break the tab-delimited header line into one field per column:
headerFields <- strsplit(headerLine, "\t")[[1]]
# drop the leading pound sign from the first field ("#CHROM" -> "CHROM") and
# keep the remaining fields unchanged:
columnNames <- c(sub("^#", "", headerFields[1]), headerFields[-1])

extract lines containing data and create data frame

# locate the "#CHROM" header row; every row after it is a data record
# (rows 13 & 14 for this file):
headerRow <- grep("^#CHROM", myLINES[, 1])
# require exactly one header row and at least one data row below it:
stopifnot(length(headerRow) == 1L, headerRow < nrow(myLINES))
# negative indexing avoids the descending sequence that a fixed
# `13:nrow(myLINES)` would produce for a file with fewer than 13 rows:
dataRows <- myLINES[-seq_len(headerRow), 1]
# split these rows by tabs (strsplit is vectorized over rows) and stack the
# pieces into a matrix with one row per variant:
variantData <- do.call(rbind, strsplit(dataRows, "\t"))
# convert matrix to data frame and assign column names:
variantData <- as.data.frame(variantData, stringsAsFactors = FALSE)
colnames(variantData) <- columnNames
print(variantData)

##               CHROM POS ID REF ALT QUAL FILTER                 INFO
## 1 NZ_BCYL01000006.1  29  .   A   G    .      .  AC=84;AF=1.0;SB=0.0
## 2 NZ_BCYL01000006.1 145  .   A   G    .      . AC=114;AF=1.0;SB=0.0
##           FORMAT                   __NONE__
## 1 GT:AC:AF:SB:NC  1:84:1.0:0.0:+G=37,-G=47,
## 2 GT:AC:AF:SB:NC 1:114:1.0:0.0:+G=42,-G=72,

manually rename the last column

# rename the "__NONE__" column to "VALUES" with base R, so this step does not
# depend on an attached package providing rename();
# stop early if the expected column is not present:
stopifnot("__NONE__" %in% names(variantData))
names(variantData)[names(variantData) == "__NONE__"] <- "VALUES"
# double-check final structure:
print(variantData)

##               CHROM POS ID REF ALT QUAL FILTER                 INFO
## 1 NZ_BCYL01000006.1  29  .   A   G    .      .  AC=84;AF=1.0;SB=0.0
## 2 NZ_BCYL01000006.1 145  .   A   G    .      . AC=114;AF=1.0;SB=0.0
##           FORMAT                     VALUES
## 1 GT:AC:AF:SB:NC  1:84:1.0:0.0:+G=37,-G=47,
## 2 GT:AC:AF:SB:NC 1:114:1.0:0.0:+G=42,-G=72,

str(variantData) # every column is character after the manual parse, POS too

## 'data.frame':    2 obs. of  10 variables:
##  $ CHROM : chr  "NZ_BCYL01000006.1" "NZ_BCYL01000006.1"
##  $ POS   : chr  "29" "145"
##  $ ID    : chr  "." "."
##  $ REF   : chr  "A" "A"
##  $ ALT   : chr  "G" "G"
##  $ QUAL  : chr  "." "."
##  $ FILTER: chr  "." "."
##  $ INFO  : chr  "AC=84;AF=1.0;SB=0.0" "AC=114;AF=1.0;SB=0.0"
##  $ FORMAT: chr  "GT:AC:AF:SB:NC" "GT:AC:AF:SB:NC"
##  $ VALUES: chr  "1:84:1.0:0.0:+G=37,-G=47," "1:114:1.0:0.0:+G=42,-G=72,"

data structures for common BIN filetypes

Collin McNeil

2025-08-27

import FASTA files

import FASTQ files

import and restructure VCF file

extract column names from 12th line of myLINES

extract lines containing data and create data frame

manually rename the last column