import FASTA files

myFA = read.FASTA('MT103168.fasta')
head(myFA) # summarize significant info
## 1 DNA sequence in binary format stored in a list.
## 
## Sequence length: 1560 
## 
## Label:
## MT103168.1 Bifidobacterium longum strain BB536 cell division...
## 
## Base composition:
##     a     c     g     t 
## 0.156 0.319 0.289 0.236 
## (Total: 1.56 kb)
str(myFA) # show structure of FASTA
## List of 1
##  $ MT103168.1 Bifidobacterium longum strain BB536 cell division protein FtsW (rodA) gene, complete cds: raw [1:1560] 88 18 48 88 ...
##  - attr(*, "class")= chr "DNAbin"

import FASTQ files

myFQ = read.fastq('ERR1072710.fastq')
head(myFQ) # summarize significant info
## 3 DNA sequences in binary format stored in a list.
## 
## Mean sequence length: 183.667 
##    Shortest sequence: 146 
##     Longest sequence: 259 
## 
## Labels:
## ERR1072710.1 10317.000001315_0 length=151
## ERR1072710.2 10317.000001315_1 length=116
## ERR1072710.4 10317.000001315_3 length=151
## 
## Base composition:
##     a     c     g     t 
## 0.318 0.208 0.254 0.219 
## (Total: 551 bases)
str(myFQ) # show structure of FASTQ
## List of 3
##  $ ERR1072710.1 10317.000001315_0 length=151: raw [1:146] 18 18 88 88 ...
##  $ ERR1072710.2 10317.000001315_1 length=116: raw [1:259] 18 28 18 28 ...
##  $ ERR1072710.4 10317.000001315_3 length=151: raw [1:146] 28 28 88 28 ...
##  - attr(*, "class")= chr "DNAbin"
##  - attr(*, "QUAL")=List of 7
##   ..$ ERR1072710.1 10317.000001315_0 length=151: num [1:11] 32 38 51 34 32 34 32 34 32 38 ...
##   ..$ ERR1072710.2 10317.000001315_1 length=116: num [1:11] 30 30 30 30 30 30 30 30 30 30 ...
##   ..$ ERR1072710.4 10317.000001315_3 length=151: num [1:42] 10 36 49 49 16 15 22 17 22 16 ...
##   ..$ NA                                       : num [1:70] 51 32 34 38 38 32 38 38 38 51 ...
##   ..$ NA                                       : num [1:67] 30 30 30 30 30 30 30 30 30 30 ...
##   ..$ NA                                       : num [1:11] 32 51 51 32 38 32 38 34 34 51 ...
##   ..$ NA                                       : num [1:11] 30 30 30 30 30 30 30 30 30 30 ...

import and restructure vcf file

# standard table read; commented-out line 12 in raw file interferes with proper column header structure:
myVCF = read.table('TwoVariants.vcf')
names(myVCF)
##  [1] "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10"
head(myVCF)
##                  V1  V2 V3 V4 V5 V6 V7                   V8             V9
## 1 NZ_BCYL01000006.1  29  .  A  G  .  .  AC=84;AF=1.0;SB=0.0 GT:AC:AF:SB:NC
## 2 NZ_BCYL01000006.1 145  .  A  G  .  . AC=114;AF=1.0;SB=0.0 GT:AC:AF:SB:NC
##                          V10
## 1  1:84:1.0:0.0:+G=37,-G=47,
## 2 1:114:1.0:0.0:+G=42,-G=72,
str(myVCF)
## 'data.frame':    2 obs. of  10 variables:
##  $ V1 : chr  "NZ_BCYL01000006.1" "NZ_BCYL01000006.1"
##  $ V2 : int  29 145
##  $ V3 : chr  "." "."
##  $ V4 : chr  "A" "A"
##  $ V5 : chr  "G" "G"
##  $ V6 : chr  "." "."
##  $ V7 : chr  "." "."
##  $ V8 : chr  "AC=84;AF=1.0;SB=0.0" "AC=114;AF=1.0;SB=0.0"
##  $ V9 : chr  "GT:AC:AF:SB:NC" "GT:AC:AF:SB:NC"
##  $ V10: chr  "1:84:1.0:0.0:+G=37,-G=47," "1:114:1.0:0.0:+G=42,-G=72,"
# preserve headers by reading entire file line by line:
myLINES = read.csv('TwoVariants.vcf', sep='\n')
str(myLINES)
## 'data.frame':    14 obs. of  1 variable:
##  $ X..fileformat.VCFv4.3: chr  "##fileDate=20220331" "##source=Naive Variant Caller version 0.0.4" "##reference=file:///corral4/main/objects/9/1/1/dataset_91189d3d-e03a-47db-b8da-37401d29104e.dat" "##INFO=<ID=AC,Number=A,Type=Integer,Description=Allele count in genotypes, for each ALT allele, in the same order as listed>" ...

extract column names from 12th line of myLINES

# line 12 contains column headers:
headerLine = myLINES[12,1]
print(headerLine)
## [1] "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t__NONE__"
# split tab separated header line into individual column names:
columnNames = strsplit(headerLine, '\t')[[1]]
# remove the pound from the first column name:
columnNames[1] = gsub('^#', '', columnNames[1])

extract lines containing data and create data frame

# extract data from rows; lines 13 & 14 contain data:
dataRows = myLINES[13:nrow(myLINES), 1]
# split these rows by tabs; combine into matrix format:
variantData <- do.call(rbind, lapply(dataRows, function(x) {
  strsplit(x, '\t')[[1]]}
  ))
# convert matrix to data frame and assign column names:
variantData = as.data.frame(variantData, stringsAsFactors = FALSE)
colnames(variantData) <- columnNames
print(variantData)
##               CHROM POS ID REF ALT QUAL FILTER                 INFO
## 1 NZ_BCYL01000006.1  29  .   A   G    .      .  AC=84;AF=1.0;SB=0.0
## 2 NZ_BCYL01000006.1 145  .   A   G    .      . AC=114;AF=1.0;SB=0.0
##           FORMAT                   __NONE__
## 1 GT:AC:AF:SB:NC  1:84:1.0:0.0:+G=37,-G=47,
## 2 GT:AC:AF:SB:NC 1:114:1.0:0.0:+G=42,-G=72,

manually rename the last column

variantData <- rename(variantData, VALUES = '__NONE__')
# double-check final structure:
print(variantData)
##               CHROM POS ID REF ALT QUAL FILTER                 INFO
## 1 NZ_BCYL01000006.1  29  .   A   G    .      .  AC=84;AF=1.0;SB=0.0
## 2 NZ_BCYL01000006.1 145  .   A   G    .      . AC=114;AF=1.0;SB=0.0
##           FORMAT                     VALUES
## 1 GT:AC:AF:SB:NC  1:84:1.0:0.0:+G=37,-G=47,
## 2 GT:AC:AF:SB:NC 1:114:1.0:0.0:+G=42,-G=72,
str(variantData)
## 'data.frame':    2 obs. of  10 variables:
##  $ CHROM : chr  "NZ_BCYL01000006.1" "NZ_BCYL01000006.1"
##  $ POS   : chr  "29" "145"
##  $ ID    : chr  "." "."
##  $ REF   : chr  "A" "A"
##  $ ALT   : chr  "G" "G"
##  $ QUAL  : chr  "." "."
##  $ FILTER: chr  "." "."
##  $ INFO  : chr  "AC=84;AF=1.0;SB=0.0" "AC=114;AF=1.0;SB=0.0"
##  $ FORMAT: chr  "GT:AC:AF:SB:NC" "GT:AC:AF:SB:NC"
##  $ VALUES: chr  "1:84:1.0:0.0:+G=37,-G=47," "1:114:1.0:0.0:+G=42,-G=72,"