# This guide script shows how to read the .tabular file produced by Galaxy BLAST into R and simplify it so that it contains only two columns of data.
# Project Step 2_11: read in and view structure
# The file is read as tab-separated text with no header row (columns V1..Vn).
# Quote and comment processing are switched off: with read.table()'s defaults
# a stray ' or " inside a hit description opens a "quoted string" that runs
# across line breaks, which merges/drops records and raises the warning
# "EOF within quoted string"; a '#' would likewise truncate a record.
myBLAST <- read.table(
  "Galaxy21-[blastn_Filter_FASTA_on_data_16__FASTA_sequences_vs___17apr2014-nt__].tabular",
  sep = "\t",
  quote = "",
  comment.char = "",
  stringsAsFactors = FALSE
)
# NOTE: the str() output recorded below was captured with the default quoting,
# so the number of observations reported there may be lower than what this
# call now returns.
str(myBLAST)
## 'data.frame': 72243 obs. of 25 variables:
## $ V1 : chr "NODE_496_length_350_cov_1.44407" "NODE_496_length_350_cov_1.44407" "NODE_496_length_350_cov_1.44407" "NODE_496_length_350_cov_1.44407" ...
## $ V2 : chr "CP003229.1" "FQ859184.1" "CP001635.1" "CP006958.1" ...
## $ V3 : num 71 71 70.6 100 100 ...
## $ V4 : int 186 186 187 36 36 169 34 34 34 124 ...
## $ V5 : int 48 48 44 0 0 50 0 0 0 30 ...
## $ V6 : int 5 5 6 0 0 2 0 0 0 3 ...
## $ V7 : int 95 95 95 9 10 107 10 10 10 173 ...
## $ V8 : int 277 277 275 44 45 274 43 43 43 294 ...
## $ V9 : int 34737 1778555 4554275 6057301 42 391082 1039 178 50 4704304 ...
## $ V10: int 34919 1778373 4554456 6057336 7 391249 1006 145 17 4704183 ...
## $ V11: num 4.02e-10 4.02e-10 1.40e-09 2.08e-07 2.08e-07 7.27e-07 2.54e-06 2.54e-06 2.54e-06 8.86e-06 ...
## $ V12: num 76.1 76.1 73.4 66.2 66.2 64.4 62.6 62.6 62.6 61.7 ...
## $ V13: chr "gi|365809978|gb|CP003229.1|" "gi|337762320|emb|FQ859184.1|" "gi|239799596|gb|CP001635.1|" "gi|566048795|gb|CP006958.1|" ...
## $ V14: int 83 83 80 72 72 70 68 68 68 67 ...
## $ V15: int 132 132 132 36 36 117 34 34 34 90 ...
## $ V16: int 132 132 132 36 36 117 34 34 34 90 ...
## $ V17: int 6 6 11 0 0 2 0 0 0 4 ...
## $ V18: num 71 71 70.6 100 100 ...
## $ V19: int 1 1 1 1 1 1 1 1 1 1 ...
## $ V20: int 1 -1 1 1 -1 1 -1 -1 -1 -1 ...
## $ V21: chr "GTGCGCGTACACAACACCAATACCGGTAAGATCCT--TCGAATGAAGATCCCGACCAAGGGCGGGCGGCCGCAGGTCGAAGGCGACTTCGCCATCGATGGTGTGCCCGGGA"| __truncated__ "GTGCGCGTACACAACACCAATACCGGTAAGATCCT--TCGAATGAAGATCCCGACCAAGGGCGGGCGGCCGCAGGTCGAAGGCGACTTCGCCATCGATGGTGTGCCCGGGA"| __truncated__ "GTGCGCGTACACAACACCAATACCGGTAAGATCCTTCGAATGAAGATCCCGACCAAGGGCG---GGCG-GCCGCAGGTCGAAGGCGACTTCGCCATCGATGGTGTGCCCGG"| __truncated__ "CGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTG" ...
## $ V22: chr "GTCCGCGTGCACAACACCAACACCGGCACGATCATCGTCGCC-GACG-TCCCGGTCCGCGACGGCCGGGCCGAGGTGGCGGGCGACTTCGCCCTCCCCGGTGTGCCGGGCA"| __truncated__ "GTCCGCGTGCACAACACCAACACCGGCACGATCATCGTCGCC-GACG-TCCCGGTCCGCGACGGCCGGGCCGAGGTGGCGGGCGACTTCGCCCTCCCCGGTGTGCCGGGCA"| __truncated__ "GTGCGAGTGCACAACACCAACACCGGCAAGATTCTGCGCATGGTGGT----GCAATGTGCGCACGGCGAGCCGCGCGTGGAAGGCGACACCGAAGTGGATGGGGTGCCTGC"| __truncated__ "CGTGACTGGAGTTCAGACGTGTGCTCTTCCGATCTG" ...
## $ V23: int 350 350 350 350 350 350 350 350 350 350 ...
## $ V24: int 1812548 1809491 5626353 6683584 9848 1126962 1209 1201 9902 5566749 ...
## $ V25: chr "Streptomyces cattleya DSM 46488 plasmid pSCATT, complete genome" "Streptomyces cattleya NRRL 8057 plasmid pSCAT, complete genome" "Variovorax paradoxus S110 chromosome 1, complete sequence" "Achromobacter xylosoxidans NBRC 15126 = ATCC 27061, complete genome" ...
# Project Step 2_12
# Keep only percent identity (column 3) and the species description
# (column 25), and drop every row whose description contains "Uncultured"
# (case-sensitive match, as before).
mySpecies <- myBLAST[!grepl("Uncultured", myBLAST[["V25"]]), c(3, 25)]
# Rename columns
names(mySpecies) <- c("PercentIdentity", "Species")
# Character fields are left quoted (the write.csv default): the descriptions
# contain commas (e.g. "..., complete genome"), so writing them with
# quote = FALSE produces rows with extra columns that cannot be read back.
write.csv(mySpecies, "BLAST.csv", row.names = FALSE)
# Project 2_13
# load stringr package for the str_split_fixed function
library(stringr)
# Split each description on spaces into exactly three pieces; the first two
# are treated as genus and species, the third holds everything that follows.
newColumns <- as.data.frame(str_split_fixed(mySpecies$Species, " ", 3))
# Paste genus and species back together. trimws() removes the stray space
# that paste() leaves when a description has only one word (empty V2).
simplifiedSpecies <- trimws(paste(newColumns$V1, newColumns$V2))
# Create the final data frame with percent identity and simple species names.
# data.frame() is used instead of as.data.frame(cbind(...)): cbind() of a
# numeric and a character vector yields a character matrix, which silently
# turned PercentIdentity into text.
myParsed <- data.frame(
  PercentIdentity = mySpecies$PercentIdentity,
  Species = simplifiedSpecies,
  stringsAsFactors = FALSE
)
# Project 2_14
# Collect each distinct simplified species name once, then report how many
# different names were found.
myUnique <- unique(myParsed[["Species"]])
length(myUnique)
## [1] 1782