ParseBLAST

R Markdown

This guide script shows how to read the .tabular file produced by Galaxy BLAST and reduce it to just two columns of data: percent identity and species.

# Project Step 2_11: read in and view structure
# Read the tab-separated BLAST hit table (no header row, so columns are
# auto-named V1, V2, ...).
# quote = "" and comment.char = "" make read.table treat apostrophes, double
# quotes and '#' inside the free-text hit descriptions as ordinary characters.
# With the defaults, an unmatched quote character swallows the following lines
# into one field ("EOF within quoted string"), silently merging or losing rows.
myBLAST <- read.table(
  "Galaxy21-[blastn_Filter_FASTA_on_data_16__FASTA_sequences_vs___17apr2014-nt__].tabular",
  sep = "\t",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)

## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## EOF within quoted string

# Print a compact summary of the table: dimensions plus each column's type and
# first few values.
utils::str(myBLAST)

## 'data.frame':    72243 obs. of  25 variables:
##  $ V1 : chr  "NODE_496_length_350_cov_1.44407" "NODE_496_length_350_cov_1.44407" "NODE_496_length_350_cov_1.44407" "NODE_496_length_350_cov_1.44407" ...
##  $ V2 : chr  "CP003229.1" "FQ859184.1" "CP001635.1" "CP006958.1" ...
##  $ V3 : num  71 71 70.6 100 100 ...
##  $ V4 : int  186 186 187 36 36 169 34 34 34 124 ...
##  $ V5 : int  48 48 44 0 0 50 0 0 0 30 ...
##  $ V6 : int  5 5 6 0 0 2 0 0 0 3 ...
##  $ V7 : int  95 95 95 9 10 107 10 10 10 173 ...
##  $ V8 : int  277 277 275 44 45 274 43 43 43 294 ...
##  $ V9 : int  34737 1778555 4554275 6057301 42 391082 1039 178 50 4704304 ...
##  $ V10: int  34919 1778373 4554456 6057336 7 391249 1006 145 17 4704183 ...
##  $ V11: num  4.02e-10 4.02e-10 1.40e-09 2.08e-07 2.08e-07 7.27e-07 2.54e-06 2.54e-06 2.54e-06 8.86e-06 ...
##  $ V12: num  76.1 76.1 73.4 66.2 66.2 64.4 62.6 62.6 62.6 61.7 ...
##  $ V13: chr  "gi|365809978|gb|CP003229.1|" "gi|337762320|emb|FQ859184.1|" "gi|239799596|gb|CP001635.1|" "gi|566048795|gb|CP006958.1|" ...
##  $ V14: int  83 83 80 72 72 70 68 68 68 67 ...
##  $ V15: int  132 132 132 36 36 117 34 34 34 90 ...
##  $ V16: int  132 132 132 36 36 117 34 34 34 90 ...
##  $ V17: int  6 6 11 0 0 2 0 0 0 4 ...
##  $ V18: num  71 71 70.6 100 100 ...
##  $ V19: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ V20: int  1 -1 1 1 -1 1 -1 -1 -1 -1 ...
##  $ V21: chr  "GTGCGCGTACACAACACCAATACCGGTAAGATCCT--TCGAATGAAGATCCCGACCAAGGGCGGGCGGCCGCAGGTCGAAGGCGACTTCGCCATCGATGGTGTGCCCGGGA"| __truncated__ "GTGCGCGTACACAACACCAATACCGGTAAGATCCT--TCGAATGAAGATCCCGACCAAGGGCGGGCGGCCGCAGGTCGAAGGCGACTTCGCCATCGATGGTGTGCCCGGGA"| __truncated__ "GTGCGCGTACACAACACCAATACCGGTAAGATCCTTCGAATGAAGATCCCGACCAAGGGCG---GGCG-GCCGCAGGTCGAAGGCGACTTCGCCATCGATGGTGTGCCCGG"| __truncated__ "CGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTG" ...
##  $ V22: chr  "GTCCGCGTGCACAACACCAACACCGGCACGATCATCGTCGCC-GACG-TCCCGGTCCGCGACGGCCGGGCCGAGGTGGCGGGCGACTTCGCCCTCCCCGGTGTGCCGGGCA"| __truncated__ "GTCCGCGTGCACAACACCAACACCGGCACGATCATCGTCGCC-GACG-TCCCGGTCCGCGACGGCCGGGCCGAGGTGGCGGGCGACTTCGCCCTCCCCGGTGTGCCGGGCA"| __truncated__ "GTGCGAGTGCACAACACCAACACCGGCAAGATTCTGCGCATGGTGGT----GCAATGTGCGCACGGCGAGCCGCGCGTGGAAGGCGACACCGAAGTGGATGGGGTGCCTGC"| __truncated__ "CGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTG" ...
##  $ V23: int  350 350 350 350 350 350 350 350 350 350 ...
##  $ V24: int  1812548 1809491 5626353 6683584 9848 1126962 1209 1201 9902 5566749 ...
##  $ V25: chr  "Streptomyces cattleya DSM 46488 plasmid pSCATT, complete genome" "Streptomyces cattleya NRRL 8057 plasmid pSCAT, complete genome" "Variovorax paradoxus S110 chromosome 1, complete sequence" "Achromobacter xylosoxidans NBRC 15126 = ATCC 27061, complete genome" ...

# Project Step 2_12
# Column slice: keep only percent identity (V3) and the hit description (V25).
mySpecies <- myBLAST[, c("V3", "V25")]

# Row slice: drop every hit whose description contains "Uncultured".
isUncultured <- grepl("Uncultured", mySpecies[["V25"]], fixed = TRUE)
mySpecies <- mySpecies[!isUncultured, ]

# Rename columns
names(mySpecies) <- c("PercentIdentity", "Species")

# Write the two-column table to disk. Fields are quoted (the write.csv default)
# because the descriptions themselves contain commas, e.g.
# "..., complete genome"; unquoted, those rows would gain extra columns and the
# file could not be read back as two columns.
write.csv(mySpecies, "BLAST.csv", row.names = FALSE)

# Project Step 2_13
# Load the stringr package for the str_split_fixed function
library(stringr)

# Split each description on spaces into exactly three pieces: the first word
# (genus), the second word (species), and everything that remains.
newColumns <- as.data.frame(str_split_fixed(mySpecies$Species, " ", 3))

# Paste genus and species back together. trimws() removes the stray space that
# paste() leaves behind when a description has only one word.
simplifiedSpecies <- trimws(paste(newColumns$V1, newColumns$V2))

# Create the final data frame with percent identity and simple species names.
# Built with data.frame() rather than cbind(): cbind() on a numeric and a
# character vector produces a character matrix, which would turn
# PercentIdentity into text.
myParsed <- data.frame(
  PercentIdentity = mySpecies$PercentIdentity,
  Species = simplifiedSpecies,
  stringsAsFactors = FALSE
)

# Project 2_14
# Collect the distinct simplified species names, then report how many there are.
myUnique <- unique(myParsed[["Species"]])
length(myUnique)

## [1] 1782