#Load Required Package
##ape provides read.FASTA() and read.fastq(), which the import steps rely on
library(ape)
#Part 1: FASTA Imports
#Read a FASTA file using read.FASTA from the ape package
##myFA is a list of class "DNAbin" where each element represents one sequence
##Each element is a raw vector holding the bases in ape's binary encoding
##(not a character string; see the str() output below)
myFA <- read.FASTA("MT103168.fasta") #Generates a binary list
#Preview the data
head(myFA)
## 1 DNA sequence in binary format stored in a list.
##
## Sequence length: 1560
##
## Label:
## MT103168.1 Bifidobacterium longum strain BB536 cell division...
##
## Base composition:
## a c g t
## 0.156 0.319 0.289 0.236
## (Total: 1.56 kb)
#Examine the structure of the object.
#Shows that it is a list of 1 and gives a preview of the data
str(myFA)
## List of 1
## $ MT103168.1 Bifidobacterium longum strain BB536 cell division protein FtsW (rodA) gene, complete cds: raw [1:1560] 88 18 48 88 ...
## - attr(*, "class")= chr "DNAbin"
#Part 2: FASTQ Imports
#Read a FASTQ file using read.fastq from the ape package
##Note the lower-case function name (read.fastq, unlike read.FASTA)
##The ape:: prefix makes the call work whether or not ape has been attached
##myFQ is a "DNAbin" list where each element corresponds to one sequence
##Each element is a raw vector of encoded bases; the quality scores are
##stored separately in the "QUAL" attribute (see the str() output below)
myFQ <- ape::read.fastq("ERR1072710.fastq") #Generates a binary list
#Previews the sequences
head(myFQ)
## 3 DNA sequences in binary format stored in a list.
##
## Mean sequence length: 183.667
## Shortest sequence: 146
## Longest sequence: 259
##
## Labels:
## ERR1072710.1 10317.000001315_0 length=151
## ERR1072710.2 10317.000001315_1 length=116
## ERR1072710.4 10317.000001315_3 length=151
##
## Base composition:
## a c g t
## 0.318 0.208 0.254 0.219
## (Total: 551 bases)
#Examine the structure
str(myFQ)
## List of 3
## $ ERR1072710.1 10317.000001315_0 length=151: raw [1:146] 18 18 88 88 ...
## $ ERR1072710.2 10317.000001315_1 length=116: raw [1:259] 18 28 18 28 ...
## $ ERR1072710.4 10317.000001315_3 length=151: raw [1:146] 28 28 88 28 ...
## - attr(*, "class")= chr "DNAbin"
## - attr(*, "QUAL")=List of 7
## ..$ ERR1072710.1 10317.000001315_0 length=151: num [1:11] 32 38 51 34 32 34 32 34 32 38 ...
## ..$ ERR1072710.2 10317.000001315_1 length=116: num [1:11] 30 30 30 30 30 30 30 30 30 30 ...
## ..$ ERR1072710.4 10317.000001315_3 length=151: num [1:42] 10 36 49 49 16 15 22 17 22 16 ...
## ..$ NA : num [1:70] 51 32 34 38 38 32 38 38 38 51 ...
## ..$ NA : num [1:67] 30 30 30 30 30 30 30 30 30 30 ...
## ..$ NA : num [1:11] 32 51 51 32 38 32 38 34 34 51 ...
## ..$ NA : num [1:11] 30 30 30 30 30 30 30 30 30 30 ...
#NOTE(review): the output above looks inconsistent - the labels report
#lengths 151/116/151 but the raw vectors are 146/259/146 long, and "QUAL"
#holds 7 entries for 3 sequences. Presumably the file is not laid out as
#strict 4-line FASTQ records; verify the input before using myFQ downstream.
#Part 3: VCF Imports
#Method 1: Parse the VCF as a whitespace-delimited table
##read.table() treats "#" as a comment character by default, so the
##commented metadata lines never reach the data frame and the columns
##receive the default names V1, V2, ...
myVCF <- read.table(file = "TwoVariants.vcf")
#Method 1: Inspect the column names and the data
colnames(myVCF)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10"
head(x = myVCF)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 NZ_BCYL01000006.1 29 . A G . . AC=84;AF=1.0;SB=0.0 GT:AC:AF:SB:NC
## 2 NZ_BCYL01000006.1 145 . A G . . AC=114;AF=1.0;SB=0.0 GT:AC:AF:SB:NC
## V10
## 1 1:84:1.0:0.0:+G=37,-G=47,
## 2 1:114:1.0:0.0:+G=42,-G=72,
str(object = myVCF)
## 'data.frame': 2 obs. of 10 variables:
## $ V1 : chr "NZ_BCYL01000006.1" "NZ_BCYL01000006.1"
## $ V2 : int 29 145
## $ V3 : chr "." "."
## $ V4 : chr "A" "A"
## $ V5 : chr "G" "G"
## $ V6 : chr "." "."
## $ V7 : chr "." "."
## $ V8 : chr "AC=84;AF=1.0;SB=0.0" "AC=114;AF=1.0;SB=0.0"
## $ V9 : chr "GT:AC:AF:SB:NC" "GT:AC:AF:SB:NC"
## $ V10: chr "1:84:1.0:0.0:+G=37,-G=47," "1:114:1.0:0.0:+G=42,-G=72,"
#Method 2: Read the full VCF including comment lines (as plain text)
##sep = "\n" makes each line a single field, giving a one-column data.frame
##header = FALSE keeps the first line ("##fileformat=...") as a data row;
##  with read.csv's default header = TRUE it was consumed as a mangled
##  column name (X..fileformat.VCFv4.3) and dropped from the rows
##quote = "" disables quote processing so double quotes in the metadata
##  are kept verbatim instead of being stripped
##read.csv() does not treat "#" as a comment, so the "##" lines are kept
myLINES <- read.csv("TwoVariants.vcf", sep = "\n", header = FALSE, quote = "")
#View a few lines of raw file
head(myLINES)
#NOTE(review): the listing below is the expected result after the fix and
#has not been re-run; any double quotes present in the file (e.g. around
#Description values) will now appear in the output as well - confirm.
## V1
## 1 ##fileformat=VCFv4.3
## 2 ##fileDate=20220331
## 3 ##source=Naive Variant Caller version 0.0.4
## 4 ##reference=file:///corral4/main/objects/9/1/1/dataset_91189d3d-e03a-47db-b8da-37401d29104e.dat
## 5 ##INFO=<ID=AC,Number=A,Type=Integer,Description=Allele count in genotypes, for each ALT allele, in the same order as listed>
## 6 ##INFO=<ID=AF,Number=A,Type=Float,Description=Allele Frequency, for each ALT allele, in the same order as listed>
#Method 1
# - Columns correspond to chromosome, position, ID, etc.
# - Data types include a mixture of characters, numeric, etc.
#Method 2
# - 'data.frame' with one column (V1)
# - Each row is a line from the original VCF file, starting with the first
# - Metadata is included with lines starting with "##"