The goal of this exercise is to make you familiar with how to download data from Google Sheets and to briefly review some key concepts R functions and coding concepts.
We’ll do the following things
## Google sheets download package
# install.packages("googlesheets4")
library(googlesheets4)
# comp bio packages
library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
web address (URL) for the spreadsheet with the data
spreadsheet_sp <- "https://docs.google.com/spreadsheets/d/1spC_ZA3_cVuvU3e_Jfcj2nEIfzp-vaP7SA5f-qwQ1pg/edit?usp=sharing"
makes sure we tell the package we aren’t interested in checking user access credentials / authorization
# be sure to run this!
googlesheets4::gs4_deauth() # <====== MUST RUN THIS
Third, we download our data.
“Error in curl::curl_fetch_memory(url, handle = handle) : Error in the HTTP2 framing layer”
If that happens, just re-run the code.
# I include this again in case you missed is the first time : )
googlesheets4::gs4_deauth()
# download
## NOTE: if you get an error, just run the code again
refseq_column <- read_sheet(ss = spreadsheet_sp, # the url
sheet = "RefSeq_prot", # the name of the worksheet
range = "selenoprot!H1:H364",
col_names = TRUE,
na = "", # fill in empty spaces "" w/NA
trim_ws = TRUE)
## ✓ Reading from "human_gene_table".
## ✓ Range ''selenoprot'!H1:H364'.
## NOTE: if you get an error, just run the code again
# for reasons we won't get into I'm going to do this
protein_refseq <- refseq_column$RefSeq_prot
snapshot of the results
protein_refseq[1:10]
## [1] "NP_000783.2" "NP_998758.1" "NP_001034804.1" "NP_001034805.1"
## [5] "NP_001311245.1" NA NA "NP_054644.1"
## [9] "NP_001353425.1" "NP_000784.3"
another column of results
# download
## NOTE: if you get an error, just run the code again
gene_name_column <- read_sheet(ss = spreadsheet_sp, # the url
sheet = "gene", # the name of the worksheet
range = "selenoprot!A1:A364",
col_names = TRUE,
na = "", # fill in empty spaces "" w/NA
trim_ws = TRUE)
## ✓ Reading from "human_gene_table".
## ✓ Range ''selenoprot'!A1:A364'.
## NOTE: if you get an error, just run the code again
# for reasons we won't get into I'm going to do this
gene <- gene_name_column$gene
downloaded a vector of character data
is(protein_refseq)
## [1] "character" "vector"
## [3] "data.frameRowLabels" "SuperClassMethod"
## [5] "character_OR_connection" "character_OR_NULL"
## [7] "atomic" "EnumerationValue"
## [9] "vector_OR_Vector" "vector_OR_factor"
class(protein_refseq)
## [1] "character"
length(protein_refseq)
## [1] 363
protein_refseq[1:10]
## [1] "NP_000783.2" "NP_998758.1" "NP_001034804.1" "NP_001034805.1"
## [5] "NP_001311245.1" NA NA "NP_054644.1"
## [9] "NP_001353425.1" "NP_000784.3"
checks for the presence of NAs using
is.na(protein_refseq)
## [1] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [13] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [37] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [73] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [253] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [289] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE
counting them up a couple different ways
table(is.na(protein_refseq))
##
## FALSE TRUE
## 334 29
counts them up in a different way
# ...
temp <- is.na(protein_refseq)
# ....
protein_refseq[temp]
## [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [26] NA NA NA NA
temp2 <- protein_refseq[temp]
# ...
length(temp2)
## [1] 29
tores data tables
seleno_df <- data.frame(gene = gene,
protein_refseq = protein_refseq)
WHAT’S THIS DOING? makes list of vectors
summary(seleno_df)
## gene protein_refseq
## Length:363 Length:363
## Class :character Class :character
## Mode :character Mode :character
head(seleno_df)
## gene protein_refseq
## 1 DIO1 NP_000783.2
## 2 DIO1 NP_998758.1
## 3 DIO1 NP_001034804.1
## 4 DIO1 NP_001034805.1
## 5 DIO1 NP_001311245.1
## 6 DIO1 <NA>
quickly yanks out all of the NAs
# omit NAs
seleno_df_noNA <- na.omit(seleno_df)
# check length- should be shorter
dim(seleno_df)
## [1] 363 2
dim(seleno_df_noNA)
## [1] 334 2
The same gene can appear multiple times because multiple isoforms are listed.
head(seleno_df_noNA)
## gene protein_refseq
## 1 DIO1 NP_000783.2
## 2 DIO1 NP_998758.1
## 3 DIO1 NP_001034804.1
## 4 DIO1 NP_001034805.1
## 5 DIO1 NP_001311245.1
## 8 DIO2 NP_054644.1
selects just one row for each gene
genes_unique <- unique(seleno_df_noNA$gene)
length(genes_unique)
## [1] 37
genes_unique
## [1] "DIO1" "DIO2" "DIO3" "GPX1" "GPX2" "GPX3"
## [7] "GPX4" "GPX6" "MSRB1" "SELENOF" "SELENOH" "SELENOI"
## [13] "SELENOK" "SELENOM" "SELENON" "SELENOO" "SELENOP" "SELENOS"
## [19] "SELENOT" "SELENOV" "SELENOW" "SEPHS2" "TXNRD1" "TXNRD2"
## [25] "TXNRD3" "SELENOP1" "SELENOP2" "SELENOU" "SELENOW1" "SELENOW2"
## [31] "SELENOE" "SELENOJ" "SELENOL" "SELENOO1" "SELENOO2" "SELENOT1"
## [37] "SELENOT2"
unique() just gives us the unique elements. A related function, duplicated(), gives us the location of duplicated elements in the vector. FALSE means “not duplicated yet” or “first instance so far”.
i.dups <- duplicated(seleno_df_noNA$gene)
We can remove the duplicates using a form of reverse indexing where the “!” means “not”. (You don’t need to know this for the exam)
seleno_df_noNA[!i.dups, ]
## gene protein_refseq
## 1 DIO1 NP_000783.2
## 8 DIO2 NP_054644.1
## 14 DIO3 NP_001353.4
## 15 GPX1 NP_000572.2
## 20 GPX2 NP_002074.2
## 24 GPX3 NP_002075.2
## 26 GPX4 NP_002076.2
## 29 GPX6 NP_874360.1
## 30 MSRB1 NP_057416.1
## 31 SELENOF NP_004252.2
## 35 SELENOH NP_734467.1
## 37 SELENOI NP_277040.1
## 39 SELENOK NP_067060.2
## 40 SELENOM NP_536355.1
## 41 SELENON NP_996809.1
## 43 SELENOO NP_113642.1
## 44 SELENOP NP_005401.3
## 47 SELENOS NP_060915.2
## 49 SELENOT NP_057359.2
## 50 SELENOV NP_874363.1
## 53 SELENOW NP_003000.1
## 54 SEPHS2 NP_036380.2
## 55 TXNRD1 NP_877393.1
## 62 TXNRD2 NP_006431.2
## 69 TXNRD3 NP_443115.1
## 232 SELENOP1 NP_001026780.2
## 233 SELENOP2 NP_001335698.1
## 236 SELENOU NP_001180447.1
## 268 SELENOW1 NP_001291715.2
## 269 SELENOW2 NP_001341647.1
## 334 SELENOE NP_001182713.2
## 338 SELENOJ NP_001180398.1
## 340 SELENOL NP_001177311.1
## 343 SELENOO1 NP_001038336.2
## 344 SELENOO2 NP_001335014.1
## 348 SELENOT1 NP_840075.2
## 350 SELENOT2 NP_001091957.2
Make a dataframe of non-duplicated genes
seleno_df_noDups <- seleno_df_noNA[!i.dups, ]
dim(seleno_df_noDups)
## [1] 37 2
Let’s select 2 random sequences to work with. We’ll use sample() to select a random index number to get
First, lets make a vector that contains a unique number for each row of data
indices <- 1:nrow(seleno_df_noDups)
This would do the same thing
# with dim
indices <- 1:dim(seleno_df_noDups)[1]
# with length
indices <- 1:length(seleno_df_noDups$gene)
or hard-coded
indices <- 1:37
We can then use sample() to select 2 random numbers from this vector.
For x = we’ll use our vector of indices (1 to 37). For size we’ll use 2, since we want to pull out just 2 numbers. For replace we’ll use WHAT? since we don’t want to be ale to select the same number twice.
i.random.genes <- sample(x = indices,
size = 2,
replace = FALSE)
Hard coded this would be
i.random.genes <- sample(x = c(1:37),
size = 2,
replace = FALSE)
This gives me two indices values.
i.random.genes
## [1] 22 21
I can now use these index values to pull out two rows of data
seleno_df_noNA[i.random.genes, ]
## gene protein_refseq
## 29 GPX6 NP_874360.1
## 28 GPX4 NP_001034937.1
Hard coded, this would be something like this for whichever genes happen to have been selected
seleno_df_noNA[c(37,15), ]
## gene protein_refseq
## 47 SELENOS NP_060915.2
## 19 GPX1 NP_001316384.1
I will now download the two FASTA files.
rentrez::entrez_fetch(id = "NP_060915.2",
db = "protein",
rettype = "fasta")
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
rentrez::entrez_fetch(id = "NP_001316384.1",
db = "protein",
rettype = "fasta")
## [1] ">NP_001316384.1 glutathione peroxidase 1 isoform 5 [Homo sapiens]\nMCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKENAKNEEILNSLKYVRPGGGFEPNFMLFEKCE\nVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTI\nDIEPDIEALLSQGPSCA\n\n"
WHAT"S THIS DOING? saves this into vectors
prot1 <- rentrez::entrez_fetch(id = "NP_060915.2",
db = "protein",
rettype = "fasta")
prot2 <- rentrez::entrez_fetch(id = "NP_001316384.1",
db = "protein",
rettype = "fasta")
I can put them into a list like this
# make the list
seleno_thingy <- vector("list", 1)
# add the first fasta
seleno_thingy[[1]] <- prot1
# See the result
seleno_thingy
## [[1]]
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
# add the first fasta
seleno_thingy[[2]] <- prot2
# see the result
seleno_thingy
## [[1]]
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
##
## [[2]]
## [1] ">NP_001316384.1 glutathione peroxidase 1 isoform 5 [Homo sapiens]\nMCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKENAKNEEILNSLKYVRPGGGFEPNFMLFEKCE\nVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTI\nDIEPDIEALLSQGPSCA\n\n"
# WHAT DOES THIS DO? combines the 2 vectors
names(seleno_thingy) <- c("prot1", "prot2")
#Output
seleno_thingy
## $prot1
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
##
## $prot2
## [1] ">NP_001316384.1 glutathione peroxidase 1 isoform 5 [Homo sapiens]\nMCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKENAKNEEILNSLKYVRPGGGFEPNFMLFEKCE\nVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTI\nDIEPDIEALLSQGPSCA\n\n"
Elements of the list are accessed like this
seleno_thingy[[1]]
## [1] ">NP_060915.2 selenoprotein S isoform 1 [Homo sapiens]\nMERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAV\nEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDS\nPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG\n\n"
I’ll clean them with fasta_cleaner()
# first, make a copy of the list for storing the clean data
## I'm just going to copy over the old data
seleno_thingy_clean <- seleno_thingy
# HOW TO MAKE THIS MORE COMPACT? put in a specific value in place of length...
for(i in 1:length(seleno_thingy_clean)){
clean_fasta_temp <- compbio4all::fasta_cleaner(seleno_thingy[[i]],
parse = T)
seleno_thingy_clean[[i]] <- clean_fasta_temp
}
Now the data looks like this
seleno_thingy_clean
## $prot1
## [1] "M" "E" "R" "Q" "E" "E" "S" "L" "S" "A" "R" "P" "A" "L" "E" "T" "E" "G"
## [19] "L" "R" "F" "L" "H" "T" "T" "V" "G" "S" "L" "L" "A" "T" "Y" "G" "W" "Y"
## [37] "I" "V" "F" "S" "C" "I" "L" "L" "Y" "V" "V" "F" "Q" "K" "L" "S" "A" "R"
## [55] "L" "R" "A" "L" "R" "Q" "R" "Q" "L" "D" "R" "A" "A" "A" "A" "V" "E" "P"
## [73] "D" "V" "V" "V" "K" "R" "Q" "E" "A" "L" "A" "A" "A" "R" "L" "K" "M" "Q"
## [91] "E" "E" "L" "N" "A" "Q" "V" "E" "K" "H" "K" "E" "K" "L" "K" "Q" "L" "E"
## [109] "E" "E" "K" "R" "R" "Q" "K" "I" "E" "M" "W" "D" "S" "M" "Q" "E" "G" "K"
## [127] "S" "Y" "K" "G" "N" "A" "K" "K" "P" "Q" "E" "E" "D" "S" "P" "G" "P" "S"
## [145] "T" "S" "S" "V" "L" "K" "R" "K" "S" "D" "R" "K" "P" "L" "R" "G" "G" "G"
## [163] "Y" "N" "P" "L" "S" "G" "E" "G" "G" "G" "A" "C" "S" "W" "R" "P" "G" "R"
## [181] "R" "G" "P" "S" "S" "G" "G" "U" "G"
##
## $prot2
## [1] "M" "C" "A" "A" "R" "L" "A" "A" "A" "A" "A" "A" "A" "Q" "S" "V" "Y" "A"
## [19] "F" "S" "A" "R" "P" "L" "A" "G" "G" "E" "P" "V" "S" "L" "G" "S" "L" "R"
## [37] "G" "K" "E" "N" "A" "K" "N" "E" "E" "I" "L" "N" "S" "L" "K" "Y" "V" "R"
## [55] "P" "G" "G" "G" "F" "E" "P" "N" "F" "M" "L" "F" "E" "K" "C" "E" "V" "N"
## [73] "G" "A" "G" "A" "H" "P" "L" "F" "A" "F" "L" "R" "E" "A" "L" "P" "A" "P"
## [91] "S" "D" "D" "A" "T" "A" "L" "M" "T" "D" "P" "K" "L" "I" "T" "W" "S" "P"
## [109] "V" "C" "R" "N" "D" "V" "A" "W" "N" "F" "E" "K" "F" "L" "V" "G" "P" "D"
## [127] "G" "V" "P" "L" "R" "R" "Y" "S" "R" "R" "F" "Q" "T" "I" "D" "I" "E" "P"
## [145] "D" "I" "E" "A" "L" "L" "S" "Q" "G" "P" "S" "C" "A"
Each element of the list is a vector of character data
class(seleno_thingy_clean[[1]])
## [1] "character"
is(seleno_thingy_clean[[1]])
## [1] "character" "vector"
## [3] "data.frameRowLabels" "SuperClassMethod"
## [5] "character_OR_connection" "character_OR_NULL"
## [7] "atomic" "EnumerationValue"
## [9] "vector_OR_Vector" "vector_OR_factor"
is.vector(seleno_thingy_clean[[1]])
## [1] TRUE
For old-times sake we can make a dotplot.
Now for a dotplot
takes each vector in the list and makes it into a separate object
prot1_vector <- seleno_thingy_clean[[1]]
prot2_vector <- seleno_thingy_clean[[2]]
We can dotplot like this
seqinr::dotPlot(prot1_vector,
prot1_vector)
accesses things directly from the list like this
seqinr::dotPlot(seleno_thingy_clean[[1]],
seleno_thingy_clean[[2]])
dotPlot likes things in a single vector, but pairwiseAlignment like a single string of characters, so as always we have to process the data.
takes everything in the vector and squish it together "" is the default for read
prot1_str <- paste(seleno_thingy_clean[[1]],sep = "", collapse = "")
prot2_str <- paste(seleno_thingy_clean[[2]],sep = "", collapse = "")
So now things look like this A vector of length 1 with the entire sequence in a single element of the vector.
prot1_str
## [1] "MERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQRQLDRAAAAVEPDVVVKRQEALAAARLKMQEELNAQVEKHKEKLKQLEEEKRRQKIEMWDSMQEGKSYKGNAKKPQEEDSPGPSTSSVLKRKSDRKPLRGGGYNPLSGEGGGACSWRPGRRGPSSGGUG"
Protein alignments need a amino acid transition matrix, and we need to use data() to bring those up into active memory (VERY IMPORTANT STEP!)
data(BLOSUM50)
The alignment
align_out <- Biostrings::pairwiseAlignment(pattern = prot1_str,
subject = prot2_str,
type = "global",
gapOpening = -9.5,
gapExtension = -0.5)
The raw output just looks like this and isn’t very informative about how things look
align_out
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MERQEESLSARPALETEGLRFLHTTVGSLLATYG...-----------------ACSWRPGRRGPSSGGUG
## subject: M---------------------------------...IDIEPDIEALLSQGPSCA----------------
## score: -160.2561
The full alignment looks like this. It should have LOTS of gaps because these are two random sequences.
compbio4all::print_pairwise_alignment(align_out)
## [1] "MERQEESLSARPALETEGLRFLHTTVGSLLATYGWYIVFSCILLYVVFQKLSARLRALRQ 60"
## [1] "M---------------------------------------C----------AARL----- 6"
## [1] " "
## [1] "RQLDRAAAAVEPDVVVKRQEALAAA--------RLKMQEELNAQVEKHKEKLKQLEEEKR 112"
## [1] "-----AAAA-------------AAAQSVYAFSAR-------------------------- 22"
## [1] " "
## [1] "RQKIEMWDSMQEGKSYKGNAKKPQEEDSPGPSTSSVLKRKSDRKPLRGGGYNPLSGE--- 169"
## [1] "--------------------------------------------PLAGG-------EPVS 31"
## [1] " "
## [1] "------------------------GGG--------------------------------- 172"
## [1] "LGSLRGKENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGAGAHPLFAFLREALPAPS 91"
## [1] " "
## [1] "------------------------------------------------------------ 172"
## [1] "DDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTIDIEPDIEALLS 151"
## [1] " "
## [1] "-----A 227"
## [1] "QGPSCA 211"
## [1] " "
These are two randomly chosen sequences, so the alignment should be pretty bad - we expect there to be just some random points where they match up.
The score is negative, but on its own that MEANS WHAT?
score(align_out)
## [1] -160.2561
pid gives us a more intutive sense of how well they match
pid(align_out)
## [1] 7.189542
Of course, pid can be calculated several ways using different denominators
pid(align_out,type = "PID1")
## [1] 7.189542
pid(align_out,type = "PID2")
## [1] 91.66667
pid(align_out,type = "PID3")
## [1] 14.01274
pid(align_out,type = "PID4")
## [1] 12.71676