library(rentrez)
library(compbio4all)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
# Download two protein records from the NCBI "protein" database as
# FASTA-formatted text. Both calls are namespace-qualified so it is explicit
# that entrez_fetch() comes from the rentrez package.
# NOTE(review): presumably "h" = human Shroom3 and "s" = a Shroom homolog from
# a second species -- confirm against the two accession records.
hShroom3 <- rentrez::entrez_fetch(
  db = "protein",
  id = "NP_065910",
  rettype = "fasta"
)
sShroom <- rentrez::entrez_fetch(
  db = "protein",
  id = "XP_783573",
  rettype = "fasta"
)
Clean the data, showing both the proper format for alignment (parse = FALSE) and the WRONG format (parse = TRUE).
# Clean each raw FASTA record twice with fasta_cleaner():
#   parse = FALSE -> the sequence as one long string (length-1 vector)
#   parse = TRUE  -> the sequence split into one vector element per residue
# TRUE/FALSE are spelled out in full: T and F are ordinary variables that can
# be reassigned, so the abbreviations are not safe to rely on.
hShroom3_cleaned_noparse <- fasta_cleaner(hShroom3, parse = FALSE)
hShroom3_cleaned_parsed <- fasta_cleaner(hShroom3, parse = TRUE)
sShroom_cleaned_noparse <- fasta_cleaner(sShroom, parse = FALSE)
sShroom_cleaned_parsed <- fasta_cleaner(sShroom, parse = TRUE)
To make an alignment, we need the data in a single long string with no spaces. The vector will therefore have a length of 1, and the number of characters in it will equal the length of the sequence.
# Number of elements in the vector; 1 means the whole sequence is one string.
length(sShroom_cleaned_noparse)
## [1] 1
# Number of characters in that single string, i.e. the sequence length.
nchar(sShroom_cleaned_noparse)
## [1] 1661
# Compact structure summary: a single (truncated for display) character string.
str(sShroom_cleaned_noparse)
## chr "MMKDAMYPTTTSTTSSSVNPLPKEVAEQKPVNTKRVRKRESQPGSPRPKSWHTDVRTLSQPDLSRMPQHSRQRHGEQTQPRYRNPPPTQYNKFHSSSDSSFMMSSYEEKTG"| __truncated__
We DO NOT want it like this:
# Number of elements in the parsed vector: one element per residue.
length(sShroom_cleaned_parsed)
## [1] 1661
# Character count of the first 15 elements; each holds a single character.
nchar(sShroom_cleaned_parsed)[1:15]
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Compact structure summary: a character vector of single-letter elements.
str(sShroom_cleaned_parsed)
## chr [1:1661] "M" "M" "K" "D" "A" "M" "Y" "P" "T" "T" "T" "S" "T" "T" "S" ...
Use the parse = FALSE version!
# Pairwise alignment of the two unparsed (single-string) protein sequences.
# NOTE(review): the "m3a" suffix in the object name does not match the second
# input (sShroom); consider renaming everywhere the object is used.
align.h3.vs.m3a <- Biostrings::pairwiseAlignment(
  pattern = hShroom3_cleaned_noparse,
  subject = sShroom_cleaned_noparse
)

# Percent sequence identity of the alignment, using pid()'s default type.
Biostrings::pid(align.h3.vs.m3a)
## [1] 25.22826
The default is type = "PID1".
# Same percent-identity calculation with the type given explicitly; the
# recorded result is identical to the call that omits `type`.
Biostrings::pid(align.h3.vs.m3a, type = "PID1")
## [1] 25.22826
Other options include PID2, PID3 and PID4.