title: “final” output: html_document date: “2023-11-18” —-
This is a tutorial on how to use R markdown for reproducible research.
Here, we can type long passages or descriptions of our data without the nedd of “hashing” out our comments with teh # symbol. In our first example, we will be using the ToothGrowth dataset. in this experiment, Guinea Pigs (literal) were given different amounts of vitamin c to see the effects on the animal’s tooth growth.
To run R code in a markdown file, we need to dentote the section that is considered R code. we call these “code chunks.”
Below is a code chunk:
Toothdata <- ToothGrowth
head(Toothdata)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
As you can see, from running the “play” button on the code chunk, the results are P[rinted inline of the r markdown file.]
Lets get the results
```r
#keggres = gage(foldchanges, gsets=kegg.ath.sigmet, same.dir = TRUE)
#greater <- keggres$greater
#lessers <- keggres$less
#rite.csv(greater, file = "BRIC_16_pathview_Greater_Pathways.csv")
#write.csv(lesser, file = "BRIC_16_pathview_Lesser_Pathways.csv")
lets map to pathways
#keggrespathways = data.frame(id=rownames(keggres$greater), keggres$greater) % > %
#tbl_df() %>%
#filter(row_number() <=5) %>%
#.$id %>%
#as.character()
#keggrespathways
#keggresids_greater = substr(keggrespathways, start=1, stop=8)
#keggresids_greater
#genedata <- as.character(all2$logFC)
Define a plotting function to apply next
#plot_pathway = function(pid) pathview(gene.data=foldchange, pathway.id = pid, species = "ath", new.signature = FALSE)
plot multiple pathways (plots are saved to disk and returns a throwaway aject list)
#tmp = sapply(keggresids_less, function(pid) pathview(gene.data=foldchanges, pathway.id = pid, species = "ath"))
#EDGE.R
library("edgeR")
library("dplyr")
library("AnnotationDbi")
library("org.Mm.eg.db")
##
#rawdata = read.csv("GLDS-102_rna_seq_Normalized_Counts.csv", row.names = "gene_id")
#group <- factor(c('1'rawdata = read.csv("GLDS-102_rna_seq_Normalized_Counts.csv", row.names = "gene_id")
#group <- factor(c('1', '1', '1','1','1','1', '2','2','2','2','2','2'))
#dgeGlm <- DGEList(counts = rawdata, group = group)
#str(dgeGlm)
#dgeGlm <- DGEList(counts = rawdata, group = group)
#str(dgeGlm)
#group <- factor(c('1', '1', '1','1','1','1', '2','2','2','2','2','2'))
#dgeGlm <- DEGList(counts = rawdata, group = group)
#str(dgeGlm)
#keep <- rowSums(cpm(dgeGlm)>2) >=4
#dgeGlm <- dgeGlm[keep,]
#names(dgeGlm)
#dgeGlm[["samples"]]
#design <- model.matrix(~group)
#design
#dgeGlmComDisp <- estimateGLMCommonDisp(dgeGlm, design, verbose = TRUE)
#dgeGlmTrendDisp <- estimateGLMTrendedDISP(dgeGlmComDisp, design)
#dgeGlmTagDisp <- estimateGLMTagwiseDisp(dgeGlmTrendDisp, design)
#plotBCV(dgeGlmTagDisp)
#fit <- glmFit(dgeGlmTagDisp, design)
#colnames(coef(fit))
#lrt <- glmLRT(fit, coef =2 )
#ttGlm <- topTags(lrt, n = Inf)
#class(ttGlm)
#summary(deGlm <- decideTestsDGE(lrt, p = 0.1, adjust = "fdr"))
#tagsGl <- rownames(dgeGlmTagDisp)[as.logical(deGlm)]
#plotSmear(lrt, de.tags = tagsGlm)
#hits2 <- ttGlm$table[ttGlm$table$FDR < 0.1, ]
#write.csv(hits2, "Mouse_EdgeR_Results_Zero_vs_1.csv")
#columns(org.Mm.eg.db)
#ttGlm2 <- as.data.frame(ttGlm$table)
#ttGlm2$symbol = mapIds(org.Mm.eg.db,
#Keys=row.names(ttGlm2),
#column = "SYMBOL",
#keytype = "ENSEMBLE",
#multiVals = "first")
#ttGlm2$sentrez + mapIds(org.Mn.eg.db
#keys=row.names(ttGlm2),
#column = "ENTREZID",
#keytype = "ENSEMBL",
#multiVals = "first")
#ttGlm2$name = mapIds(org.Mm.eg.db,
#keys=row.names(ttGlm2),
#column = "ENTREZID",
#keytype = "ENSEMBL",
#multiVals = "first")
#head(ttGlm2)
#library(pathview)
#library(gage)
#library(gageData)
#data(kegg.sets.mm)
#data(go.subs.mm)
#foldchanges <- ttGlm2$logFC
#names(foldchanges) <- ttGlm2$entrez
#kegg.mm = kegg.gsets("mouse", id.type = "entrez")
#kegg.mm.sigmet = kegg.mm$kg.sets[kegg.mm$sigmet.idx]
# Lets get the results
#keggres = gage(foldchanges, gsets=kegg.mm.sigmet, same.dir = TRUE)
#lapply(keggres, head)
#greaters < - keggres$greater
#lessers <- keggres$less
#keggrespathways = data.frame(id = rownames(keggres$greater), keggres$greater) %>%
#tbl_df() %>%
#filter(row_number()<=5) %>%
#.$id %>%
#as.character()
#keggrespathways
#keggresids = substr(keggrespathways, start=1, stop = 8)
#keggresids
#plot_pathway = function(pid) pathview(gene.data=foldchanges, #pathway.id =pid, species = "mouse", new.signature = FALSE)
#plot multiple pathways
#tmp = sapply(keggresids, function(pid) #pathview(gene.data=foldchanges, pathway.id = pid, species = "mouse"))
#keggrespathways = data.frame(id = rownames(keggres$less), #keggres$less) %>%
#tbl_df() %>%
#filter(row_number()<=5) %>%
#pathway.id =pid, species = "mouse", new.signature = FALSE)
# Plot multiple pathways
#tmp = sapply(keggresids, function(pid) #pathview(gene.data=foldchanges, pathway.id = pid, species = "mouse"))
#library(imager)
#filenames <- list.file(path = #"E:Bioinformatics/Bisc_450_Scripts/mouse_edgeR", pattern = ".*pathview.png")
#all_images <- lapply(filenames, load.image)
#knitr::include_graphics(filenames)
#Deseq
lets load the required libraries for this analysis
BiocManager::install("apeglm")
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## 'help("repositories", package = "BiocManager")' for details.
## Replacement repositories:
## CRAN: https://cloud.r-project.org
## Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.0 (2023-04-21)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'apeglm'
## Installation paths not writeable, unable to update packages
## path: /usr/lib/R/library
## packages:
## boot, class, foreign, KernSmooth, lattice, MASS, Matrix, mgcv, nlme, nnet,
## rpart, spatial, survival
## Old packages: 'AnnotationDbi', 'BiocVersion', 'dplyr', 'GenomeInfoDb', 'Gviz',
## 'htmltools', 'lifecycle', 'Luminescence', 'maps', 'matrixStats', 'mclust',
## 'phytools', 'plotrix', 'RcppEigen', 'RCurl', 'rlang', 'RSQLite', 'shiny',
## 'SparseArray', 'stringi', 'stringr', 'VariantAnnotation', 'XML'
library("DESeq2")
## Loading required package: GenomicRanges
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
library("dplyr")
library("apeglm")
lets load in our data
countData <- read.csv("GLDS-102_rna_seq_Normalized_Counts.csv")
colData <- read.csv("PHENO_DATA_mouse.csv", sep = ",", row.names = 1)
now we need to add leveling factors to the colData
colData$group <- factor(colData$group, levels = c("0", "1"))
now lets make sure we have the same number of individuals as well as groups
all(rownames(colData)) %in% colnames(countData)
## Warning in all(rownames(colData)): coercing argument of type 'character' to
## logical
## [1] FALSE
we need to round up the counts, because DESeq2 only allows integers as an input, and not fractional numbers. This depends on the method of alignment that was used upstream.
countData %>%
mutate_if(is.numeric, ceiling)
#countData[, 2:13] <- sapply(countData[,2:13], as.integer)
#row.names(countData) <- countData[,1]
#countData <- countData[2:13]
#rownames(colData) == colnames(countData)
#dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~group)
#dds <- dds[rowSums(counts(dds)>2)>=4]
#dds <- DESeq(dds)
#res <- results(dds)
#resLFC <- lfcShrink(dds, coef= 2)
#(resOrdered <- res[order(res$padj),])
#summary(res)
#FLT_Vs_GC <- as.data.frame(res$log2FoldChange)
#head(FLT_Vs_GC)
#plotMA(resLFC, ylim = c(-2,2))
#pdf(file = "MA_Plot_FLT_VS_GC.pdf")
#dev.off()
lets save our differential expression results to file.
#write.csv(as.data.frame(resOrdered), file = "Mouse_DESeq.csv")
lets perform a custom transformation
#dds <- estimateSizeFactors(dds)
#se <- SummarizedExperiment(log2(counts(dds, normalize = TRUE) +1), #colData = colData(dds))
#pdf(file = "PCA_PLOT_FLT_VS_GC.pdf")
#plotPCA(DESeqTransform(se), intgroup = "group")
lets load our requirements packages
#library(AnnotationDbi)
#library(org.Mm.eg.db)
#columns(org.Mm.eg.db)
#foldchanges <-as.data.frame(res$log2FoldChange, row.names = #row.names(res))
#head(foldchanges)
#res$symbol = mapIds(org.Mm.eg.db,
#keys = row.names(res),
#column = "SYMBOL",
#keytype = "ENSEMBL",
#multiVals = "first")
#res$entrez = mapIds(org.Mm.eg.db,
#keys = row.names(res),
#column = "ENTREZID",
#keytype = "ENSEMBL",
#multiVals = "first")
#res$genename = mapIds(org.Mm.eg.db,
#keys = row.names(res),
#column = "GENENAME",
#keytype = "ENSEMBL",
#multiVals = "first")
#head(res, 10)
Lets load our pathview packages
library(pathview)
##
## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
## formally cite the original Pathview paper (not just mention it) in publications
## or products. For details, do citation("pathview") within R.
##
## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
## license agreement (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################
library(gage)
##
library(gageData)
data(kegg.sets.mm)
data(go.subs.mm)
#foldchanges <- res$log2FoldChange
#names(foldchanges)= res$entrez
#head(foldchanges)
#kegg.mm <- kegg.gsets("mouse", id.type = "entrez")
#kegg.mm.sigmet <- kegg.mm$kg.sets[kegg.mm$sigmet.idx]
lets map the results
#keggres <- gage(foldchanges, gsets = kegg.mm.sigmet, same.dir = TRUE)
#lapply(keggres, head)
lets save our greater and less than pathways
#greaters <- keggres$greater
#lessers <- keggres$less
#keggrespathways <- data.frame(id = rownames(keggres$greater), #keggres$greater) %>%
#tbl_df() %>%
#filter(row_number() <= 3) %>%
#.$id %>%
#as.character()
#keggrespathways
#keggresids <- substr(keggrespathways, start = 1, stop = 8)
#keggresids
PLOT!
plot_pathway = function(pid) pathview(gene.data = foldchange, pathway.id = pid, species = "mouse", new.signature = FALSE)
#tmp = sapply(keggresids, function(pid) pathview(gene.data = #foldchanges, pathway.id = pid, species = "mouse"))
#keggrespathways <- data.frame(id = rownames(greaters), #keggres$greater) %>%
#tbl_df() %>%
#filter(row_number() <= 3) %>%
#.$id %>%
#as.character()
#keggrespathways
#keggresids <- substr(keggrespathways, start = 1, stop = 8)
#keggresids
PLOT!
plot_pathway = function(pid) pathview(gene.data = foldchange, pathway.id = pid, species = "mouse", new.signature = FALSE)
#tmp = sapply(keggresids, function(pid) pathview(gene.data = #foldchanges, pathway.id = pid, species = "mouse"))
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:GenomicRanges':
##
## subtract
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following objects are masked from 'package:SummarizedExperiment':
##
## resize, width
## The following objects are masked from 'package:GenomicRanges':
##
## resize, width
## The following object is masked from 'package:Biostrings':
##
## width
## The following object is masked from 'package:XVector':
##
## width
## The following objects are masked from 'package:oligoClasses':
##
## B, B<-
## The following objects are masked from 'package:IRanges':
##
## resize, width
## The following object is masked from 'package:S4Vectors':
##
## width
## The following object is masked from 'package:Biobase':
##
## channel
## The following object is masked from 'package:BiocGenerics':
##
## width
## The following object is masked from 'package:stringr':
##
## boundary
## The following object is masked from 'package:dplyr':
##
## where
## The following object is masked from 'package:tidyr':
##
## fill
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
filenames <- list.files(path = "E:/Bioinformatics/Bisc_450_Scripts/mouse_DEseq/bkb030_email_latech_edu", pattern = ".*pathview.png")
all_images <- lapply(filenames, load.image)
knitr::include_graphics(filenames)
#EdgeR vs. DESeq
library(tidyverse)
#EdgeR <- read.csv("Mouse_EdgeR_Results_Zero_vs_1.csv")
#DESeq <- read.csv("Mouse_DESeq.csv")
#DESeq2 <- DESeq %>%
#filter(padj < 0.1)
#DESeq2 <- DESeq2[,c(1,3)]
#EdgeR <- EdgeR[,1:2]
#colnames(DESeq2) <- c("ID", "logFC")
#colnames(EdgeR) <- c("ID", "logFC")
#install.packages("GOplot")
#library(GOplot)
#comp <- GOVenn(DESeq2, EdgeR, label = c("DeSeq1", "EdgeR"), title = #"Comparison of DESeq and EdgeR DE Genes", plot = FALSE)
#comp$plot
#Multiple Protein Alignment
library(msa)
#seq <- readAAStringSet(file.path(getwd(), "datasets", "ch3", #"hglobin.fa"))
seq <- readAAStringSet(“hglbibin.fa”)
seq
## function (...)
## UseMethod("seq")
## <bytecode: 0x55c703b9ebc0>
## <environment: namespace:base>
Lets align the 8 different amino acid sequences
#alignments <- msa(seq, method = "ClustalW")
#alignments
#msaPrettyPrint(alignments, output = "pdf", showNames = "left",
#showLogo = "none", askForOverwrite = "FALSE",
#verbose = TRUE, file = "whole_aligh.pdf")
#msaPrettyPrint(alignments, c(10,30), output = "pdf", showNames = #"left",
#file = "Zoomed_align.pdf", showLogo = "top", #askForOverwrite = FALSE,
#verbose = TRUE)
Lets make a tree from our alignment
library(ape)
##
## Attaching package: 'ape'
## The following object is masked from 'package:imager':
##
## where
## The following object is masked from 'package:Biostrings':
##
## complement
## The following object is masked from 'package:dplyr':
##
## where
library(seqinr)
##
## Attaching package: 'seqinr'
## The following objects are masked from 'package:ape':
##
## as.alignment, consensus
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:Biostrings':
##
## translate
## The following object is masked from 'package:limma':
##
## zscore
## The following object is masked from 'package:dplyr':
##
## count
Convert to seqinr alignment -> get the distances and make a tree
#alignment_seqinr <- alignment_seqinr <-msaConvert(alignments, type = #"seqinr::alignment")
#distances1 <- seqinr::dis.alignment(alignment_seqinr, "identify")
#tree <- ape::nj(distances1)
#plot(tree, main = "Phylogenetic Tree of HBA Sequences")
#Synteny
library(DECIPHER)
## Loading required package: RSQLite
## Loading required package: parallel
In the first step, we load the libraries and the sequence into long_seqs This is a DNAStringSet object ~Desktop/classroom/myfiles
#long_seq <- readRNAStringSet(file.path(getwd(), "datasets", "ch3", #"plastid_genomes.fa"))
#long_seq
Now lets build a temporary SQLite database
#seqs2DB(long_seq, "XStringSet", "long_db", names(long_seq))
Now that we’ve built the database, we can do the following: Find the syntenic blocks
#synteny <- FindSynteny("long_db")
View blocks with plotting
#pairs(synteny)
#plot(synteny)
Make an actual alignment file
#alignment <- AlignSynteny(synteny, "long_db")
Lets create a structure holding all aligned syntenic blocks for a pair of sequences
#block <- unlist(alignment[[1]])
we can write to file one align
#writeXStringSet(block, "genome_blocks_out.fa")
#library(DECIPHER)
#Unannotated Regions
library(locfit)
## locfit 1.5-9.8 2023-06-11
##
## Attaching package: 'locfit'
## The following object is masked from 'package:purrr':
##
## none
library(Rsamtools)
Lets creat a function that will load the gene region information in a GFF file and convert it to a bioconductor GRanges object
get_annotated_regions_from_gff <- function(file_name) {
gff <- rtracklayer::inport.gff(file_name)
as(gff, "GRanges")
}
get count in windows across the genome in 500bp segments
#whole_genome <- csaw::windowCounts(
file.path(~Desktop/classroom/myfile)
file.path(getwd(), "datasets", "ch1", "Windows.bam")
## [1] "/home/student/Desktop/classroom/myfiles/datasets/ch1/Windows.bam"
#bin = TRUE,
#filter = 0,
#width = 500
#param = csaw::readParam(
#minq = 20, # determines what is a passing read
#dedup = TRUE, # removes pcr duplicate
#pe = "both" # requires that both pairs of reads are aligned
Since this is a single column of data, let’s rename it
#colnames(whole_genome) <- c("small_data")
#annotated_regions <- #get_annotated_regions_from_gff(file.path(getwd)(), "datasets", "ch1", #"genes.gff"))
Now that we have the windows of high expression, we want to see if any of them overlap with annotated regions.
library(IRanges)
library(SummarizedExperiment)
Find the overlaps between the windows we made
#windows_in_genes <- IRanges::overlapsAny(
#SummarizedExperiment::rowRanges(whole_genome), # creates a Granges #object
#annotated_regions
#)
#windows_in_genes
Here we subset the whole_genome object into annotated and nonannotated regions
#annotated_window_counts <- whole_genome[windows_in_genes,]
#nonannotated_window_counts <- whole_genome[!windows_in_genes,]
Use assay () to extract the actual counts from the Granges object
#assay(non_annotated_window_counts)
In this step, we us Rsamtools Puleup() function to get a per-base coverage dataframe. Each row represents a single nucleotide in the reference count and the count column gives the depth of coverage at that point
#library(bumphunter)
#pile_df <- Rsamtools::pileup(file.path(getwd(), "datasets", "ch1", #"windows.bam"))
This step groups the reads with certain distance of each other into a cluster. We give the sequence names, position and distance.
#cluster <- bumphunter::clusterMaker(pile_df$seqnames, pile_df$pos, #maxGap = 100)
#table(cluster)
In this step, we will map the reads to the regions we found for the genome
#bumphunter::regionFinder(pile_df$count, pile_df$seqnames, #pile_df$pos, clusters, cutoff = 1)
#Tree.io
Lets load the required packages
library(ggplot2)
library(ggtree)
## ggtree v3.10.0 For help: https://yulab-smu.top/treedata-book/
##
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
##
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
##
## Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
## for mapping and visualizing associated data on phylogeny using ggtree.
## Molecular Biology and Evolution. 2018, 35(12):3041-3043.
## doi:10.1093/molbev/msy194
##
## Attaching package: 'ggtree'
## The following object is masked from 'package:ape':
##
## rotate
## The following object is masked from 'package:magrittr':
##
## inset
## The following object is masked from 'package:reshape':
##
## expand
## The following object is masked from 'package:Biostrings':
##
## collapse
## The following object is masked from 'package:IRanges':
##
## collapse
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following object is masked from 'package:tidyr':
##
## expand
library(treeio)
## treeio v1.26.0 For help: https://yulab-smu.top/treedata-book/
##
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240
##
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
##
## Guangchuang Yu. Using ggtree to visualize data on tree-like structures.
## Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96
##
## Attaching package: 'treeio'
## The following object is masked from 'package:seqinr':
##
## read.fasta
## The following object is masked from 'package:Biostrings':
##
## mask
First we need to load our raw tree data. Its a newick format so we use:
itol <- ape::read.tree("itol.nwk")
Now we will print out a very basic phylogenetic tree
ggtree(itol)
We can also change the format to make it a circular tree
ggtree(itol, layout = "circular")
We can also change the left-right/ up-down direction
#ggtee(itol) + cord_flip() + scale_x_reverse()
By using geom_tipla() we can add names to the end of tips
#ggtree(itol) + genom_tiplab(color = "indianred", size = 1.5)
By adding a geom_strip() layer we can annotate clades in the tree with a block of color
ggtree(itol, layour = "unrooted") + geom_strip(13,14, color = "red", barsize = 1)
## Warning in stat_tree(data = data, mapping = mapping, geom = "segment", position = position, : Ignoring unknown parameters: `layour`
## Ignoring unknown parameters: `layour`
We can highlight clades in unrooted trees with blobs of color using geom_hilight
ggtree(itol, layout = "unrooted") + geom_hilight(node = 11, type = "encircle", fill = "steelblue")
## "daylight" method was used as default layout for unrooted tree.
## Average angle change [1] 0.174910612627282
## Average angle change [2] 0.161645191380673
## Average angle change [3] 0.129304375923319
## Average angle change [4] 0.0825706767962636
## Average angle change [5] 0.100056259084497
install.packages('BAMMtools')
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(BAMMtools)
We can use the MRCA (most recent common ancestor ) functions to find the node we want
getmrca(itol, "Photorhabdus_luminescens", "Blochmannia_floridanus")
## [1] 206
Now if we want to highlight the section of the most recent common ancester between the two above
ggtree(itol, layout = "unrooted") + geom_hilight(node = 206 , type = "encircle", fill = "steelblue")
## "daylight" method was used as default layout for unrooted tree.
## Average angle change [1] 0.174910612627282
## Average angle change [2] 0.161645191380673
## Average angle change [3] 0.129304375923319
## Average angle change [4] 0.0825706767962636
## Average angle change [5] 0.100056259084497
#Treespace
Quantifying difference between trees with treespace
First lets load the required packages
library(ape)
library(adegraphics)
## Registered S3 methods overwritten by 'adegraphics':
## method from
## biplot.dudi ade4
## kplot.foucart ade4
## kplot.mcoa ade4
## kplot.mfa ade4
## kplot.pta ade4
## kplot.sepan ade4
## kplot.statis ade4
## scatter.coa ade4
## scatter.dudi ade4
## scatter.nipals ade4
## scatter.pco ade4
## score.acm ade4
## score.mix ade4
## score.pca ade4
## screeplot.dudi ade4
##
## Attaching package: 'adegraphics'
## The following object is masked from 'package:ape':
##
## zoom
## The following object is masked from 'package:GenomicRanges':
##
## score
## The following object is masked from 'package:Biostrings':
##
## score
## The following object is masked from 'package:BiocGenerics':
##
## score
library(treespace)
## Loading required package: ade4
##
## Attaching package: 'ade4'
## The following objects are masked from 'package:adegraphics':
##
## kplotsepan.coa, s.arrow, s.class, s.corcircle, s.distri, s.image,
## s.label, s.logo, s.match, s.traject, s.value, table.value,
## triangle.class
## The following object is masked from 'package:GenomicRanges':
##
## score
## The following object is masked from 'package:Biostrings':
##
## score
## The following object is masked from 'package:BiocGenerics':
##
## score
Fow we need to load all the tree files into a multi Phylo object
#treefiles <- list.files9file.path(getwd(),"gene_trees") full.names = #TRUE
#tree_list <- lapply(treefiles, read.tree)
#class(tree_list)
#class(tree-list) <- "multiphylo"
#class(tree_list)
Now we can compute the kendall-coljin distance between trees. This function does a LOT of analysis.
First it runs a pairwise comparison of all trees in the input
Second it carries out clustering using PCA
These results are returned in a list of objects,where $D contains the pairwise
Metric of the trees, and $pco contains the PCA. The method we use (Kendal-Coljin) is particularly suitable for rooted trees as we have here. The option NF tells us how many principal components to retain.
#comparison <- treespace(tree_list, nf = 3)
We can plot the pairwise distance between trees with table.image
#table.image(comparisons$D, nclass= 25)
Now lets print the PCA and clusters, this shows us how the group of trees cluster
#plotGroves(comparison$pco, lab.show = TRUE, lab.cex = 1.5)
#groves <- findGroves(comparisons, nclust = 2)
#plotGroves(groves)
#Binding Trees
Extracting and working with subtrees using APE
load our required package
library(ape)
Now lets load the tree data we will be working with
newick <- read.tree("mammal_tree.nwk")
#1 <- subtrees(newick)
Lets plot the tree to see what it looks like
plot(newick)
We can subset this plot using the “node” function
#plot(1[[4]], sub = "Node 4")
Extract the tree manually
small_tree <- extract.clade(newick, 9)
plot(small_tree)
Now what if we want to bind two trees together.
new_tree <- bind.tree(newick, small_tree, 4)
plot(new_tree)
#Trees From Alignment
Reconstructing trees from alignment
Lets load the packages
library(Biostrings)
library(msa)
library(phangorn)
First we’ll load the sequences into a seqs variable
seqs <- readAAStringSet("abc.fa")
Now, lets construct an aliognment with the msa package and ClustalOmega
aln <- msa::msa(seqs, method = c("ClustalOmega"))
## using Gonnet
To create a tree, we need to convert the alignment to a phyData objects
aln <- as.phyDat(aln, type = "AA")
class(aln)
## [1] "phyDat"
In this step, we/ll actually make the trees. Tres are made from a distance matrix, which can be computed with dist.ml() - ML stands for maximum liklihood
dist_mat <- dist.ml(aln)
Now we pass the distance matrix to upgma to make a UPGMA tree.
upgma_tree <- upgma(dist_mat)
plot(upgma_tree, main = "UPGMA")
We can conversley pass the distance matrix to a neighbor joining function
#nj_tree <- NJ(dist-mat)
#plot(nj_tree, main = "NJ")
Lastly, we are oging to use the bootstraps.phyDat() function to compute bootstrap support for the branches of the tree. The first argument is the object (aln), while the second argument in the ufunction ng
Bootstraps are e3ssentially confidence intervals for how the clade is placed in the correct position.
#fit <- plm(nj_tree, aln)
#bootstraps <- bootstrap.phyDat(aln, FUN = function(x) {NJ(dist.ml)}, #bs = 100)
#plotBS(nj_tree, bootstraps, p = 10 )
#Identifying Variants
First lets load the required libraries
library(GenomicRanges)
library(rtracklayer)
library(VariantAnnotation)
##
## Attaching package: 'VariantAnnotation'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:base':
##
## tabulate
#library(gmapR)
library(VariantTools)
Now we want to load our datasets. We need . Bam and . fa files for this pipeline to work
bam_file <- file.path(getwd(), "hg17_snps.bam")
fasta_file <- file.path(getwd(), "chr17.83k.fa")
Now we need to set up the genome object and the paramters object:
fa <- rtracklayer::FastaFile(fasta_file)
Now we create a GMapGenome object, which describes the genome to the later variant calling function
#genome <- gmapR:: GmapGenome(fa, create = TRUE)
This next step sets our parameter for what is considered a variant. There can be a lot of fine-tuning to this function. We are just going to use the standard setting
#qual_params <- TallyVariantsParam(
#genome = genome,
#minimum_mapq = 20)
#var_params <- VariantCallingFilter(read.count = 19, p.lower = 0.01)
Now we use CallVariants function in accordance with our paramters we defined above
#called_variants <- callVariants(bam_file,
#qual_params,
#calling.filter = var_params)
#head(called_variants)
we have identified 6 variants
Now, we move on to annotation and load the feature position information from gff
get_annotated_regions_from_gff <- function(file_name) {
gff <- rtracklayer::import.gff(file_name)
as(gff, "GRanges")
}
Note you can also load this data from a bed file
genes <- get_annotated_regions_from_gff("chr17.83k.gff3")
Now we can calculate which variants overlap which genes
#overlaps <- GenomicRanges::findOverlaps(called_variants, genes)
#overlaps
and lastly we subset the genes with the list of overlaps
#genes [subjectHits(overlaps)]
#Open Reading Frames
First thing, lets load the required packages
library(Biostrings)
library(systemPipeR)
## Loading required package: ShortRead
## Loading required package: BiocParallel
## Loading required package: GenomicAlignments
##
## Attaching package: 'GenomicAlignments'
## The following object is masked from 'package:dplyr':
##
## last
##
## Attaching package: 'ShortRead'
## The following object is masked from 'package:adegraphics':
##
## zoom
## The following objects are masked from 'package:locfit':
##
## left, right
## The following object is masked from 'package:ape':
##
## zoom
## The following object is masked from 'package:imager':
##
## clean
## The following object is masked from 'package:magrittr':
##
## functions
## The following object is masked from 'package:oligo':
##
## intensity
## The following objects are masked from 'package:oligoClasses':
##
## chromosome, position
## The following object is masked from 'package:affy':
##
## intensity
## The following object is masked from 'package:dplyr':
##
## id
## The following object is masked from 'package:purrr':
##
## compose
## The following object is masked from 'package:tibble':
##
## view
##
## Attaching package: 'systemPipeR'
## The following object is masked from 'package:VariantAnnotation':
##
## reference
## The following object is masked from 'package:DESeq2':
##
## results
Lets load the data into a DNAstrings object, in this case, an Arabiopsis chloroplast # genome
dna_object <- readDNAStringSet("arabidopsis_chloroplast.fa")
Now lets predict the open reading frames which predORF(), we’ll predict all ORF on both strands.
#predict_orfs <- predORF(dna_objects, n = 'all', type = 'gr', mode = #'ORF', strand = 'both',
#longest_disjoint = TRUE)
#predict_orfs
This printed out a GRanges object in return, which 2,501 open reading frames. This is FAR too many open reading frames.
to filter out erronous ORFs, we do a simulation. We first estimate the lengthen ORF can reach by chance. We will create a string of random nucleotides that is the length of our chloroplast genome and determine the longer ORF that can arise by chance.
bases <- c("A", "T", "G", "C")
raw_seq_string <- strsplit(as.character(dna_object), "")
Now we need to ensure that our random nucleotides match the proportion of nucleotides in our chloroplast genome so we have no bias.
seq_length <- width(dna_object[1])
#counts <- lapply(bases, function(x) {sum(grep1(x, raw_seq_string))})
# <- unlist(lapply(counts, #function(base_count){signif(base_count/seq_length, 2)}))
#probs
Now we can build our function to simulate a genome
#get_longest_orf_in_random_genome <- fnction(x,
#length = 1000,
#probs = c(0.25, 0.25, #0.25, 0.25),
#bases = c("A", "T", "G", #"C")){
# Here we create our random genome and allow replaement for the next iteration
# random_genome <- paste0(sample(bases, size = length, replace = TRUE, prob = probs), collapse = "")
#random_dna_objects <- DNAStringsSet(random_genome)
#names(random_dna_object) <- c("random_dna_string")
#orfs <- predORF(random_dna_objects, n =1, type = 'gr', mode = #'ORF', strand = 'both', longest_disjoint = TRUE)
#return(max(width(orfs)))
#}
Now we use the function we just created to predict the ORFs in 10 random genomes
#random_lengths <- unlist(lapply(1:10, #get_longest_orf_in_random_genome, length = seq_length, probs = probs, #bases = bases))
Lets pull out the longest length from. our 10 simulations
#longest_random_orf <- max(random_lengths)
Lets only keep the frames that are longer in our chloroplast genome
#keep <- width(predict_orfs) > longest_random_orf
#orfs_to_keep <- predict_orfs [keep]
#orfs_to_keep
Write this data to file.
#extracted_orfs <- BSgenome::getSeq(dna_object, orfs_to_keep)
#names(extracted_orfs) <- paste0("orf_", 1:length(orfs_to_keep))
#writeXStringSet(extracted_orfs, "saved_orfs.fa")
#Karyoplot
First lets load the required packages
library(karyoploteR)
## Loading required package: regioneR
library(GenomicRanges)
Now we need to set up the genome object for our karyotype
#genome_df <- data.frame(
first we dictate the number of chromosomes
#chr = paste0 ("chr", 1:5),
#start = rep (1, 5),
and then we will dictate the length of each chromosome
end = c (34964571, 22037565, 25499034, 20862711, 31270811)
#)
Now we convert the data frame to a granges object
#genome_gr <- makeGRangesFromDataFrame(genome_df)
Now lets create some snp positions to map out. We do this by using the sample() function
snp_pos <- sample(1:1e7, 25)
#snps <- data.frame(
#chr = paste0("chr", smaple(1:5,25, replace = TRUE)),
#start = snp_pos,
#end = snp_pos
#)
again we convert the dataframe to granges
#snps_gr <- makeGRangesFromDataFrame (snps)
Now lets create some snp labels
#snp_labels <- paste0("snp_", 1:25)
Here we will set the margins for our plot
plot.params <- getDefaultPlotParams(plot.type =1)
Here we will set the margins of our plot
plot.params$data1outmargin <- 600
Now lets plot our snps
#kp <- plotKaryotype(genome= genome_gr, plot.type = 1, plot.params = plot.params)
#kpPlotmarkers(kp, snps_gr, labels = snap_labels)
We can also add some numeric data to our plots. We will generate 100 random numbers that plot to 100 windows on chromosome 5
#numeric_data <- data.frame(
#y = rnorm(100, mean. = 1, sd = 0.5),
#chr = rep("chr4", 100),
#start = seq(1, 20862711, 20862711/100),
#end = seq(1,20682711, 20862711/100)
#)
Now lets make the data a granges object
#numeric_data_gr <- makeGRangesFromDataFrame(numeric_data)
Again lets set our plot parameters
plot.params <- getDefaultPlotParams(plot.type = 2)
plot.params$data1outmargin <- 800
plot.params$data2outmargin <- 800
plot.params$topmargin <- 800
Lets plot the data
#kp <- plotKaryotype(genome= genome_gr, plot.type = 2, plot.params = plot.params)
#kpPlotmarkers(kp, snps_gr, labels = snp_labels)
#kpLines(kp, numeric_data_gr, y = numeric_data$y, data.panel = 2)
I had to use hashtags for the code that would not run.