Edge.R

title: “final” output: html_document date: “2023-11-18” —-

This is a tutorial on how to use R markdown for reproducible research.

Here, we can type long passages or descriptions of our data without the nedd of “hashing” out our comments with teh # symbol. In our first example, we will be using the ToothGrowth dataset. in this experiment, Guinea Pigs (literal) were given different amounts of vitamin c to see the effects on the animal’s tooth growth.

To run R code in a markdown file, we need to dentote the section that is considered R code. we call these “code chunks.”

Below is a code chunk:

Toothdata <- ToothGrowth

head(Toothdata)

##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5

As you can see, from running the “play” button on the code chunk, the results are P[rinted inline of the r markdown file.]


Lets get the results 


```r
#keggres = gage(foldchanges, gsets=kegg.ath.sigmet, same.dir = TRUE)

#greater <- keggres$greater
#lessers <- keggres$less

#rite.csv(greater, file = "BRIC_16_pathview_Greater_Pathways.csv")
#write.csv(lesser, file = "BRIC_16_pathview_Lesser_Pathways.csv")

lets map to pathways

#keggrespathways = data.frame(id=rownames(keggres$greater), keggres$greater) % > %
  #tbl_df() %>%
  #filter(row_number() <=5) %>%
  #.$id %>%
  #as.character()
#keggrespathways

#keggresids_greater = substr(keggrespathways, start=1, stop=8)
#keggresids_greater

#genedata <- as.character(all2$logFC)

Define a plotting function to apply next

#plot_pathway = function(pid) pathview(gene.data=foldchange, pathway.id = pid, species = "ath", new.signature = FALSE)

plot multiple pathways (plots are saved to disk and returns a throwaway aject list)

#tmp = sapply(keggresids_less, function(pid) pathview(gene.data=foldchanges, pathway.id = pid, species = "ath"))

#EDGE.R

library("edgeR")
library("dplyr")
library("AnnotationDbi")
library("org.Mm.eg.db")

##

#rawdata = read.csv("GLDS-102_rna_seq_Normalized_Counts.csv", row.names = "gene_id")

#group <- factor(c('1'rawdata = read.csv("GLDS-102_rna_seq_Normalized_Counts.csv", row.names = "gene_id")

#group <- factor(c('1', '1', '1','1','1','1', '2','2','2','2','2','2'))

#dgeGlm <- DGEList(counts = rawdata, group = group)
#str(dgeGlm)

#dgeGlm <- DGEList(counts = rawdata, group = group)
#str(dgeGlm)
#group <- factor(c('1', '1', '1','1','1','1', '2','2','2','2','2','2'))

#dgeGlm <- DEGList(counts = rawdata, group = group)
#str(dgeGlm)

#keep <- rowSums(cpm(dgeGlm)>2) >=4

#dgeGlm <- dgeGlm[keep,]

#names(dgeGlm)

#dgeGlm[["samples"]]

#design <- model.matrix(~group)
#design

#dgeGlmComDisp <- estimateGLMCommonDisp(dgeGlm, design, verbose = TRUE)

#dgeGlmTrendDisp <- estimateGLMTrendedDISP(dgeGlmComDisp, design)

#dgeGlmTagDisp <- estimateGLMTagwiseDisp(dgeGlmTrendDisp, design)

#plotBCV(dgeGlmTagDisp)

#fit <- glmFit(dgeGlmTagDisp, design)

#colnames(coef(fit))

#lrt <- glmLRT(fit, coef =2 )

#ttGlm <- topTags(lrt, n = Inf)

#class(ttGlm)

#summary(deGlm <- decideTestsDGE(lrt, p = 0.1, adjust = "fdr"))

#tagsGl <- rownames(dgeGlmTagDisp)[as.logical(deGlm)]

#plotSmear(lrt, de.tags = tagsGlm)

#hits2 <- ttGlm$table[ttGlm$table$FDR < 0.1, ]

#write.csv(hits2, "Mouse_EdgeR_Results_Zero_vs_1.csv")

#columns(org.Mm.eg.db)

#ttGlm2 <- as.data.frame(ttGlm$table)

#ttGlm2$symbol = mapIds(org.Mm.eg.db,
                       #Keys=row.names(ttGlm2),
                       #column = "SYMBOL",
                       #keytype = "ENSEMBLE",
                       #multiVals = "first")
#ttGlm2$sentrez + mapIds(org.Mn.eg.db
                       #keys=row.names(ttGlm2),
                       #column = "ENTREZID",
                       #keytype = "ENSEMBL",
                       #multiVals = "first")
#ttGlm2$name = mapIds(org.Mm.eg.db,
                       #keys=row.names(ttGlm2),
                       #column = "ENTREZID",
                       #keytype = "ENSEMBL",
                       #multiVals = "first")
#head(ttGlm2)

#library(pathview)
#library(gage)
#library(gageData)
#data(kegg.sets.mm)
#data(go.subs.mm)

#foldchanges <- ttGlm2$logFC
#names(foldchanges) <- ttGlm2$entrez
#kegg.mm = kegg.gsets("mouse", id.type = "entrez")
#kegg.mm.sigmet = kegg.mm$kg.sets[kegg.mm$sigmet.idx]

# Lets get the results
#keggres = gage(foldchanges, gsets=kegg.mm.sigmet, same.dir = TRUE)

#lapply(keggres, head)

#greaters < - keggres$greater
#lessers <- keggres$less

#keggrespathways = data.frame(id = rownames(keggres$greater), keggres$greater) %>%
  #tbl_df() %>%
  #filter(row_number()<=5) %>%
  #.$id %>%
  #as.character()
#keggrespathways

#keggresids = substr(keggrespathways, start=1, stop = 8)
#keggresids

#plot_pathway = function(pid) pathview(gene.data=foldchanges, #pathway.id =pid, species = "mouse", new.signature = FALSE)

#plot multiple pathways
#tmp = sapply(keggresids, function(pid) #pathview(gene.data=foldchanges, pathway.id = pid, species = "mouse"))

#keggrespathways = data.frame(id = rownames(keggres$less), #keggres$less) %>%
  #tbl_df() %>%
  #filter(row_number()<=5) %>%
  #pathway.id =pid, species = "mouse", new.signature = FALSE)

# Plot multiple pathways
#tmp = sapply(keggresids, function(pid) #pathview(gene.data=foldchanges, pathway.id = pid, species = "mouse"))

#library(imager)
#filenames <- list.file(path = #"E:Bioinformatics/Bisc_450_Scripts/mouse_edgeR", pattern = ".*pathview.png")

#all_images <- lapply(filenames, load.image)

#knitr::include_graphics(filenames)

#Deseq

lets load the required libraries for this analysis

BiocManager::install("apeglm")

## 'getOption("repos")' replaces Bioconductor standard repositories, see
## 'help("repositories", package = "BiocManager")' for details.
## Replacement repositories:
##     CRAN: https://cloud.r-project.org

## Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.0 (2023-04-21)

## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'apeglm'

## Installation paths not writeable, unable to update packages
##   path: /usr/lib/R/library
##   packages:
##     boot, class, foreign, KernSmooth, lattice, MASS, Matrix, mgcv, nlme, nnet,
##     rpart, spatial, survival

## Old packages: 'AnnotationDbi', 'BiocVersion', 'dplyr', 'GenomeInfoDb', 'Gviz',
##   'htmltools', 'lifecycle', 'Luminescence', 'maps', 'matrixStats', 'mclust',
##   'phytools', 'plotrix', 'RcppEigen', 'RCurl', 'rlang', 'RSQLite', 'shiny',
##   'SparseArray', 'stringi', 'stringr', 'VariantAnnotation', 'XML'

library("DESeq2")

## Loading required package: GenomicRanges

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## The following object is masked from 'package:dplyr':
## 
##     count

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## The following object is masked from 'package:Biobase':
## 
##     rowMedians

library("dplyr")
library("apeglm")

lets load in our data

countData <- read.csv("GLDS-102_rna_seq_Normalized_Counts.csv")

colData <- read.csv("PHENO_DATA_mouse.csv", sep = ",", row.names = 1)

now we need to add leveling factors to the colData

colData$group <- factor(colData$group, levels = c("0", "1"))

now lets make sure we have the same number of individuals as well as groups

all(rownames(colData)) %in% colnames(countData)

## Warning in all(rownames(colData)): coercing argument of type 'character' to
## logical

## [1] FALSE

we need to round up the counts, because DESeq2 only allows integers as an input, and not fractional numbers. This depends on the method of alignment that was used upstream.

countData %>%
  mutate_if(is.numeric, ceiling)

#countData[, 2:13] <- sapply(countData[,2:13], as.integer)

#row.names(countData) <- countData[,1]

#countData <- countData[2:13]

#rownames(colData) == colnames(countData)

#dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~group)

#dds <- dds[rowSums(counts(dds)>2)>=4]

#dds <- DESeq(dds)

#res <- results(dds)

#resLFC <- lfcShrink(dds, coef= 2)

#(resOrdered <- res[order(res$padj),])

#summary(res)

#FLT_Vs_GC <- as.data.frame(res$log2FoldChange)
#head(FLT_Vs_GC)

#plotMA(resLFC, ylim = c(-2,2))

#pdf(file = "MA_Plot_FLT_VS_GC.pdf")

#dev.off()

lets save our differential expression results to file.

#write.csv(as.data.frame(resOrdered), file = "Mouse_DESeq.csv")

lets perform a custom transformation

#dds <- estimateSizeFactors(dds)

#se <- SummarizedExperiment(log2(counts(dds, normalize = TRUE) +1), #colData = colData(dds))

#pdf(file = "PCA_PLOT_FLT_VS_GC.pdf")

#plotPCA(DESeqTransform(se), intgroup = "group")

lets load our requirements packages

#library(AnnotationDbi)
#library(org.Mm.eg.db)

#columns(org.Mm.eg.db)

#foldchanges <-as.data.frame(res$log2FoldChange, row.names = #row.names(res))
#head(foldchanges)

#res$symbol = mapIds(org.Mm.eg.db,
                    #keys = row.names(res),
                    #column = "SYMBOL",
                    #keytype = "ENSEMBL", 
                    #multiVals = "first")
#res$entrez = mapIds(org.Mm.eg.db,
                    #keys = row.names(res),
                    #column = "ENTREZID",
                    #keytype = "ENSEMBL", 
                    #multiVals = "first")
#res$genename = mapIds(org.Mm.eg.db,
                    #keys = row.names(res),
                    #column = "GENENAME",
                    #keytype = "ENSEMBL", 
                    #multiVals = "first")

#head(res, 10)

Lets load our pathview packages

library(pathview)

##

## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
## formally cite the original Pathview paper (not just mention it) in publications
## or products. For details, do citation("pathview") within R.
## 
## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
## license agreement (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################

library(gage)

##

library(gageData)
data(kegg.sets.mm)
data(go.subs.mm)

#foldchanges <- res$log2FoldChange
#names(foldchanges)= res$entrez

#head(foldchanges)

#kegg.mm <- kegg.gsets("mouse", id.type = "entrez")
#kegg.mm.sigmet <- kegg.mm$kg.sets[kegg.mm$sigmet.idx]

lets map the results

#keggres <- gage(foldchanges, gsets = kegg.mm.sigmet, same.dir = TRUE)

#lapply(keggres, head)

lets save our greater and less than pathways

#greaters <- keggres$greater
#lessers <- keggres$less

#keggrespathways <- data.frame(id = rownames(keggres$greater), #keggres$greater) %>%
  #tbl_df() %>%
  #filter(row_number() <= 3) %>%
  #.$id %>%
  #as.character()
#keggrespathways

#keggresids <- substr(keggrespathways, start = 1, stop = 8)
#keggresids

PLOT!

plot_pathway = function(pid) pathview(gene.data = foldchange, pathway.id = pid, species = "mouse", new.signature = FALSE)

#tmp = sapply(keggresids, function(pid) pathview(gene.data = #foldchanges, pathway.id = pid, species = "mouse"))

#keggrespathways <- data.frame(id = rownames(greaters), #keggres$greater) %>%
  #tbl_df() %>%
  #filter(row_number() <= 3) %>%
  #.$id %>%
  #as.character()
#keggrespathways

#keggresids <- substr(keggrespathways, start = 1, stop = 8)
#keggresids

PLOT!

plot_pathway = function(pid) pathview(gene.data = foldchange, pathway.id = pid, species = "mouse", new.signature = FALSE)

#tmp = sapply(keggresids, function(pid) pathview(gene.data = #foldchanges, pathway.id = pid, species = "mouse"))

library(imager)

## Loading required package: magrittr

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:GenomicRanges':
## 
##     subtract

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

## 
## Attaching package: 'imager'

## The following object is masked from 'package:magrittr':
## 
##     add

## The following objects are masked from 'package:SummarizedExperiment':
## 
##     resize, width

## The following objects are masked from 'package:GenomicRanges':
## 
##     resize, width

## The following object is masked from 'package:Biostrings':
## 
##     width

## The following object is masked from 'package:XVector':
## 
##     width

## The following objects are masked from 'package:oligoClasses':
## 
##     B, B<-

## The following objects are masked from 'package:IRanges':
## 
##     resize, width

## The following object is masked from 'package:S4Vectors':
## 
##     width

## The following object is masked from 'package:Biobase':
## 
##     channel

## The following object is masked from 'package:BiocGenerics':
## 
##     width

## The following object is masked from 'package:stringr':
## 
##     boundary

## The following object is masked from 'package:dplyr':
## 
##     where

## The following object is masked from 'package:tidyr':
## 
##     fill

## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum

## The following object is masked from 'package:graphics':
## 
##     frame

## The following object is masked from 'package:base':
## 
##     save.image

filenames <- list.files(path = "E:/Bioinformatics/Bisc_450_Scripts/mouse_DEseq/bkb030_email_latech_edu", pattern = ".*pathview.png")

all_images <- lapply(filenames, load.image)

knitr::include_graphics(filenames)

#EdgeR vs. DESeq

library(tidyverse)

#EdgeR <- read.csv("Mouse_EdgeR_Results_Zero_vs_1.csv")

#DESeq <- read.csv("Mouse_DESeq.csv")

#DESeq2 <- DESeq %>% 
  #filter(padj < 0.1)

#DESeq2 <- DESeq2[,c(1,3)]

#EdgeR <- EdgeR[,1:2]

#colnames(DESeq2) <- c("ID", "logFC")
#colnames(EdgeR) <- c("ID", "logFC")

#install.packages("GOplot")
#library(GOplot) 

#comp <- GOVenn(DESeq2, EdgeR, label = c("DeSeq1", "EdgeR"), title = #"Comparison of DESeq and EdgeR DE Genes", plot = FALSE)

#comp$plot

#Multiple Protein Alignment

library(msa)

#seq <- readAAStringSet(file.path(getwd(), "datasets", "ch3", #"hglobin.fa"))

seq <- readAAStringSet(“hglbibin.fa”)

seq

## function (...) 
## UseMethod("seq")
## <bytecode: 0x55c703b9ebc0>
## <environment: namespace:base>

Lets align the 8 different amino acid sequences

#alignments <- msa(seq, method = "ClustalW")
#alignments

#msaPrettyPrint(alignments, output = "pdf", showNames = "left", 
               #showLogo = "none", askForOverwrite = "FALSE", 
               #verbose = TRUE, file = "whole_aligh.pdf")

#msaPrettyPrint(alignments, c(10,30), output = "pdf", showNames = #"left",
               #file = "Zoomed_align.pdf", showLogo = "top", #askForOverwrite = FALSE,
               #verbose = TRUE)

Lets make a tree from our alignment

library(ape)

## 
## Attaching package: 'ape'

## The following object is masked from 'package:imager':
## 
##     where

## The following object is masked from 'package:Biostrings':
## 
##     complement

## The following object is masked from 'package:dplyr':
## 
##     where

library(seqinr)

## 
## Attaching package: 'seqinr'

## The following objects are masked from 'package:ape':
## 
##     as.alignment, consensus

## The following object is masked from 'package:matrixStats':
## 
##     count

## The following object is masked from 'package:Biostrings':
## 
##     translate

## The following object is masked from 'package:limma':
## 
##     zscore

## The following object is masked from 'package:dplyr':
## 
##     count

Convert to seqinr alignment -> get the distances and make a tree

#alignment_seqinr <- alignment_seqinr <-msaConvert(alignments, type = #"seqinr::alignment")

#distances1 <- seqinr::dis.alignment(alignment_seqinr, "identify")

#tree <- ape::nj(distances1)
#plot(tree, main = "Phylogenetic Tree of HBA Sequences")

#Synteny

library(DECIPHER)

## Loading required package: RSQLite

## Loading required package: parallel

In the first step, we load the libraries and the sequence into long_seqs This is a DNAStringSet object ~Desktop/classroom/myfiles

#long_seq <- readRNAStringSet(file.path(getwd(), "datasets", "ch3", #"plastid_genomes.fa"))
#long_seq

Now lets build a temporary SQLite database

#seqs2DB(long_seq, "XStringSet", "long_db", names(long_seq))

Now that we’ve built the database, we can do the following: Find the syntenic blocks

#synteny <- FindSynteny("long_db")

View blocks with plotting

#pairs(synteny)
#plot(synteny)

Make an actual alignment file

#alignment <- AlignSynteny(synteny, "long_db")

Lets create a structure holding all aligned syntenic blocks for a pair of sequences

#block <- unlist(alignment[[1]])

we can write to file one align

#writeXStringSet(block, "genome_blocks_out.fa")

#library(DECIPHER)

#Unannotated Regions

library(locfit)

## locfit 1.5-9.8    2023-06-11

## 
## Attaching package: 'locfit'

## The following object is masked from 'package:purrr':
## 
##     none

library(Rsamtools)

Lets creat a function that will load the gene region information in a GFF file and convert it to a bioconductor GRanges object

get_annotated_regions_from_gff <- function(file_name) {
  gff <- rtracklayer::inport.gff(file_name)
  as(gff, "GRanges")
}

get count in windows across the genome in 500bp segments

#whole_genome <- csaw::windowCounts(

file.path(~Desktop/classroom/myfile)

  file.path(getwd(), "datasets", "ch1", "Windows.bam")

## [1] "/home/student/Desktop/classroom/myfiles/datasets/ch1/Windows.bam"

  #bin = TRUE,
  #filter = 0,
  #width = 500
  #param = csaw::readParam(
    #minq = 20, # determines what is a passing read
    #dedup = TRUE, # removes pcr duplicate
    #pe = "both" # requires that both pairs of reads are aligned

Since this is a single column of data, let’s rename it

#colnames(whole_genome) <- c("small_data")

#annotated_regions <- #get_annotated_regions_from_gff(file.path(getwd)(), "datasets", "ch1", #"genes.gff"))

Now that we have the windows of high expression, we want to see if any of them overlap with annotated regions.

library(IRanges)
library(SummarizedExperiment)

Find the overlaps between the windows we made

#windows_in_genes <- IRanges::overlapsAny(
  #SummarizedExperiment::rowRanges(whole_genome), # creates a Granges #object
  #annotated_regions
#)

#windows_in_genes

Here we subset the whole_genome object into annotated and nonannotated regions

#annotated_window_counts <- whole_genome[windows_in_genes,]
#nonannotated_window_counts <- whole_genome[!windows_in_genes,]

Use assay () to extract the actual counts from the Granges object

#assay(non_annotated_window_counts)

In this step, we us Rsamtools Puleup() function to get a per-base coverage dataframe. Each row represents a single nucleotide in the reference count and the count column gives the depth of coverage at that point

#library(bumphunter)

#pile_df <- Rsamtools::pileup(file.path(getwd(), "datasets", "ch1", #"windows.bam"))

This step groups the reads with certain distance of each other into a cluster. We give the sequence names, position and distance.

#cluster <- bumphunter::clusterMaker(pile_df$seqnames, pile_df$pos, #maxGap = 100)

#table(cluster)

In this step, we will map the reads to the regions we found for the genome

#bumphunter::regionFinder(pile_df$count, pile_df$seqnames, #pile_df$pos, clusters, cutoff = 1)

#Tree.io

Lets load the required packages

library(ggplot2)
library(ggtree)

## ggtree v3.10.0 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## 
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
## 
## Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
## for mapping and visualizing associated data on phylogeny using ggtree.
## Molecular Biology and Evolution. 2018, 35(12):3041-3043.
## doi:10.1093/molbev/msy194

## 
## Attaching package: 'ggtree'

## The following object is masked from 'package:ape':
## 
##     rotate

## The following object is masked from 'package:magrittr':
## 
##     inset

## The following object is masked from 'package:reshape':
## 
##     expand

## The following object is masked from 'package:Biostrings':
## 
##     collapse

## The following object is masked from 'package:IRanges':
## 
##     collapse

## The following object is masked from 'package:S4Vectors':
## 
##     expand

## The following object is masked from 'package:tidyr':
## 
##     expand

library(treeio)

## treeio v1.26.0 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240
## 
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
## 
## Guangchuang Yu. Using ggtree to visualize data on tree-like structures.
## Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96

## 
## Attaching package: 'treeio'

## The following object is masked from 'package:seqinr':
## 
##     read.fasta

## The following object is masked from 'package:Biostrings':
## 
##     mask

First we need to load our raw tree data. Its a newick format so we use:

itol <- ape::read.tree("itol.nwk")

Now we will print out a very basic phylogenetic tree

ggtree(itol)

We can also change the format to make it a circular tree

ggtree(itol, layout = "circular")

We can also change the left-right/ up-down direction

#ggtee(itol) + cord_flip() + scale_x_reverse()

By using geom_tipla() we can add names to the end of tips

#ggtree(itol) + genom_tiplab(color = "indianred", size = 1.5)

By adding a geom_strip() layer we can annotate clades in the tree with a block of color

ggtree(itol, layour = "unrooted") +  geom_strip(13,14, color = "red", barsize = 1)

## Warning in stat_tree(data = data, mapping = mapping, geom = "segment", position = position, : Ignoring unknown parameters: `layour`
## Ignoring unknown parameters: `layour`

We can highlight clades in unrooted trees with blobs of color using geom_hilight

ggtree(itol, layout = "unrooted") + geom_hilight(node = 11, type = "encircle", fill = "steelblue")

## "daylight" method was used as default layout for unrooted tree.

## Average angle change [1] 0.174910612627282

## Average angle change [2] 0.161645191380673

## Average angle change [3] 0.129304375923319

## Average angle change [4] 0.0825706767962636

## Average angle change [5] 0.100056259084497

install.packages('BAMMtools')

## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

library(BAMMtools)

We can use the MRCA (most recent common ancestor ) functions to find the node we want

getmrca(itol, "Photorhabdus_luminescens", "Blochmannia_floridanus")

## [1] 206

Now if we want to highlight the section of the most recent common ancester between the two above

ggtree(itol, layout = "unrooted") + geom_hilight(node = 206 , type = "encircle", fill = "steelblue")

## "daylight" method was used as default layout for unrooted tree.

## Average angle change [1] 0.174910612627282

## Average angle change [2] 0.161645191380673

## Average angle change [3] 0.129304375923319

## Average angle change [4] 0.0825706767962636

## Average angle change [5] 0.100056259084497

#Treespace

Quantifying difference between trees with treespace

First lets load the required packages

library(ape)
library(adegraphics)

## Registered S3 methods overwritten by 'adegraphics':
##   method         from
##   biplot.dudi    ade4
##   kplot.foucart  ade4
##   kplot.mcoa     ade4
##   kplot.mfa      ade4
##   kplot.pta      ade4
##   kplot.sepan    ade4
##   kplot.statis   ade4
##   scatter.coa    ade4
##   scatter.dudi   ade4
##   scatter.nipals ade4
##   scatter.pco    ade4
##   score.acm      ade4
##   score.mix      ade4
##   score.pca      ade4
##   screeplot.dudi ade4

## 
## Attaching package: 'adegraphics'

## The following object is masked from 'package:ape':
## 
##     zoom

## The following object is masked from 'package:GenomicRanges':
## 
##     score

## The following object is masked from 'package:Biostrings':
## 
##     score

## The following object is masked from 'package:BiocGenerics':
## 
##     score

library(treespace)

## Loading required package: ade4

## 
## Attaching package: 'ade4'

## The following objects are masked from 'package:adegraphics':
## 
##     kplotsepan.coa, s.arrow, s.class, s.corcircle, s.distri, s.image,
##     s.label, s.logo, s.match, s.traject, s.value, table.value,
##     triangle.class

## The following object is masked from 'package:GenomicRanges':
## 
##     score

## The following object is masked from 'package:Biostrings':
## 
##     score

## The following object is masked from 'package:BiocGenerics':
## 
##     score

Fow we need to load all the tree files into a multi Phylo object

#treefiles <- list.files9file.path(getwd(),"gene_trees") full.names = #TRUE
#tree_list <- lapply(treefiles, read.tree)
#class(tree_list)

#class(tree-list) <- "multiphylo"

#class(tree_list)

Now we can compute the kendall-coljin distance between trees. This function does a LOT of analysis.

First it runs a pairwise comparison of all trees in the input

Second it carries out clustering using PCA

These results are returned in a list of objects,where $D contains the pairwise

Metric of the trees, and $pco contains the PCA. The method we use (Kendal-Coljin) is particularly suitable for rooted trees as we have here. The option NF tells us how many principal components to retain.

#comparison <- treespace(tree_list, nf = 3)

We can plot the pairwise distance between trees with table.image

#table.image(comparisons$D, nclass= 25)

Now lets print the PCA and clusters, this shows us how the group of trees cluster

#plotGroves(comparison$pco, lab.show = TRUE, lab.cex = 1.5)

#groves <- findGroves(comparisons, nclust = 2)
#plotGroves(groves)

#Binding Trees

Extracting and working with subtrees using APE

load our required package

library(ape)

Now lets load the tree data we will be working with

newick <- read.tree("mammal_tree.nwk")

#1 <- subtrees(newick)

Lets plot the tree to see what it looks like

plot(newick)

We can subset this plot using the “node” function

#plot(1[[4]], sub = "Node 4")

Extract the tree manually

small_tree <- extract.clade(newick, 9)

plot(small_tree)

Now what if we want to bind two trees together.

new_tree <- bind.tree(newick, small_tree, 4)
plot(new_tree)

#Trees From Alignment

Reconstructing trees from alignment

Lets load the packages

library(Biostrings)
library(msa)
library(phangorn)

First we’ll load the sequences into a seqs variable

seqs <- readAAStringSet("abc.fa")

Now, lets construct an aliognment with the msa package and ClustalOmega

aln <- msa::msa(seqs, method = c("ClustalOmega"))

## using Gonnet

To create a tree, we need to convert the alignment to a phyData objects

aln <- as.phyDat(aln, type = "AA")

class(aln)

## [1] "phyDat"

In this step, we/ll actually make the trees. Tres are made from a distance matrix, which can be computed with dist.ml() - ML stands for maximum liklihood

dist_mat <- dist.ml(aln)

Now we pass the distance matrix to upgma to make a UPGMA tree.

upgma_tree <- upgma(dist_mat)
plot(upgma_tree, main = "UPGMA")

We can conversley pass the distance matrix to a neighbor joining function

#nj_tree <- NJ(dist-mat)
#plot(nj_tree, main = "NJ")

Lastly, we are oging to use the bootstraps.phyDat() function to compute bootstrap support for the branches of the tree. The first argument is the object (aln), while the second argument in the ufunction ng

Bootstraps are e3ssentially confidence intervals for how the clade is placed in the correct position.

#fit <- plm(nj_tree, aln)

#bootstraps <- bootstrap.phyDat(aln, FUN = function(x) {NJ(dist.ml)}, #bs =  100)

#plotBS(nj_tree, bootstraps, p = 10 )

#Identifying Variants

First lets load the required libraries

library(GenomicRanges)
library(rtracklayer)
library(VariantAnnotation)

## 
## Attaching package: 'VariantAnnotation'

## The following object is masked from 'package:stringr':
## 
##     fixed

## The following object is masked from 'package:base':
## 
##     tabulate

#library(gmapR)
library(VariantTools)

Now we want to load our datasets. We need . Bam and . fa files for this pipeline to work

bam_file <- file.path(getwd(), "hg17_snps.bam")

fasta_file <- file.path(getwd(), "chr17.83k.fa")

Now we need to set up the genome object and the paramters object:

fa <- rtracklayer::FastaFile(fasta_file)

Now we create a GMapGenome object, which describes the genome to the later variant calling function

#genome <- gmapR:: GmapGenome(fa, create = TRUE)

This next step sets our parameter for what is considered a variant. There can be a lot of fine-tuning to this function. We are just going to use the standard setting

#qual_params <- TallyVariantsParam(
  #genome = genome,
  #minimum_mapq = 20)

#var_params <- VariantCallingFilter(read.count = 19, p.lower = 0.01)

Now we use CallVariants function in accordance with our paramters we defined above

#called_variants <- callVariants(bam_file,
                                #qual_params,
                                #calling.filter = var_params)

#head(called_variants)

we have identified 6 variants

Now, we move on to annotation and load the feature position information from gff

get_annotated_regions_from_gff <- function(file_name) {
  gff <- rtracklayer::import.gff(file_name)
  as(gff, "GRanges")
}

Note you can also load this data from a bed file

genes <- get_annotated_regions_from_gff("chr17.83k.gff3")

Now we can calculate which variants overlap which genes

#overlaps <- GenomicRanges::findOverlaps(called_variants, genes)

#overlaps

and lastly we subset the genes with the list of overlaps

#genes [subjectHits(overlaps)]

#Open Reading Frames

First thing, lets load the required packages

library(Biostrings)
library(systemPipeR)

## Loading required package: ShortRead

## Loading required package: BiocParallel

## Loading required package: GenomicAlignments

## 
## Attaching package: 'GenomicAlignments'

## The following object is masked from 'package:dplyr':
## 
##     last

## 
## Attaching package: 'ShortRead'

## The following object is masked from 'package:adegraphics':
## 
##     zoom

## The following objects are masked from 'package:locfit':
## 
##     left, right

## The following object is masked from 'package:ape':
## 
##     zoom

## The following object is masked from 'package:imager':
## 
##     clean

## The following object is masked from 'package:magrittr':
## 
##     functions

## The following object is masked from 'package:oligo':
## 
##     intensity

## The following objects are masked from 'package:oligoClasses':
## 
##     chromosome, position

## The following object is masked from 'package:affy':
## 
##     intensity

## The following object is masked from 'package:dplyr':
## 
##     id

## The following object is masked from 'package:purrr':
## 
##     compose

## The following object is masked from 'package:tibble':
## 
##     view

## 
## Attaching package: 'systemPipeR'

## The following object is masked from 'package:VariantAnnotation':
## 
##     reference

## The following object is masked from 'package:DESeq2':
## 
##     results

Lets load the data into a DNAstrings object, in this case, an Arabiopsis chloroplast # genome

dna_object <- readDNAStringSet("arabidopsis_chloroplast.fa")

Now lets predict the open reading frames which predORF(), we’ll predict all ORF on both strands.

#predict_orfs <- predORF(dna_objects, n = 'all', type = 'gr', mode = #'ORF', strand = 'both',
                        #longest_disjoint = TRUE)
#predict_orfs

This printed out a GRanges object in return, which 2,501 open reading frames. This is FAR too many open reading frames.

to filter out erronous ORFs, we do a simulation. We first estimate the lengthen ORF can reach by chance. We will create a string of random nucleotides that is the length of our chloroplast genome and determine the longer ORF that can arise by chance.

bases <- c("A", "T", "G", "C")
raw_seq_string <- strsplit(as.character(dna_object), "")

Now we need to ensure that our random nucleotides match the proportion of nucleotides in our chloroplast genome so we have no bias.

seq_length <- width(dna_object[1])
#counts <- lapply(bases, function(x) {sum(grep1(x, raw_seq_string))})
# <- unlist(lapply(counts, #function(base_count){signif(base_count/seq_length, 2)}))
#probs

Now we can build our function to simulate a genome

#get_longest_orf_in_random_genome <- fnction(x,
                                            #length = 1000,
                                            #probs = c(0.25, 0.25, #0.25, 0.25),
                                            #bases = c("A", "T", "G", #"C")){
  # Here we create our random genome and allow replaement for the next iteration
 # random_genome <- paste0(sample(bases, size = length, replace = TRUE, prob = probs), collapse = "")
  #random_dna_objects <- DNAStringsSet(random_genome)
  #names(random_dna_object) <- c("random_dna_string")
  
  #orfs <- predORF(random_dna_objects, n =1, type = 'gr', mode = #'ORF', strand = 'both', longest_disjoint = TRUE)
  #return(max(width(orfs)))
#}

Now we use the function we just created to predict the ORFs in 10 random genomes

#random_lengths <- unlist(lapply(1:10, #get_longest_orf_in_random_genome, length = seq_length, probs = probs, #bases = bases))

Lets pull out the longest length from. our 10 simulations

#longest_random_orf <- max(random_lengths)

Lets only keep the frames that are longer in our chloroplast genome

#keep <- width(predict_orfs) > longest_random_orf

#orfs_to_keep <- predict_orfs [keep]
#orfs_to_keep

Write this data to file.

#extracted_orfs <- BSgenome::getSeq(dna_object, orfs_to_keep)
#names(extracted_orfs) <- paste0("orf_", 1:length(orfs_to_keep))
#writeXStringSet(extracted_orfs, "saved_orfs.fa")

#Karyoplot

First lets load the required packages

library(karyoploteR)

## Loading required package: regioneR

library(GenomicRanges)

Now we need to set up the genome object for our karyotype

#genome_df <- data.frame(

first we dictate the number of chromosomes

  #chr = paste0 ("chr", 1:5),
  #start = rep (1, 5),

and then we will dictate the length of each chromosome

  end = c (34964571, 22037565, 25499034, 20862711, 31270811)
#)

Now we convert the data frame to a granges object

#genome_gr <- makeGRangesFromDataFrame(genome_df)

Now lets create some snp positions to map out. We do this by using the sample() function

snp_pos <- sample(1:1e7, 25)
#snps <- data.frame(
  #chr = paste0("chr", smaple(1:5,25, replace = TRUE)),
  #start = snp_pos,
  #end = snp_pos
#)

again we convert the dataframe to granges

#snps_gr <- makeGRangesFromDataFrame (snps)

Now lets create some snp labels

#snp_labels <- paste0("snp_", 1:25)

Here we will set the margins for our plot

plot.params <- getDefaultPlotParams(plot.type =1)

Here we will set the margins of our plot

plot.params$data1outmargin <- 600

Now lets plot our snps

#kp <- plotKaryotype(genome= genome_gr, plot.type = 1, plot.params = plot.params)
#kpPlotmarkers(kp, snps_gr, labels = snap_labels)

We can also add some numeric data to our plots. We will generate 100 random numbers that plot to 100 windows on chromosome 5

#numeric_data <- data.frame(
  #y = rnorm(100, mean. = 1, sd = 0.5),
  #chr = rep("chr4", 100), 
  #start = seq(1, 20862711, 20862711/100),
  #end = seq(1,20682711, 20862711/100)
#)

Now lets make the data a granges object

#numeric_data_gr <- makeGRangesFromDataFrame(numeric_data)

Again lets set our plot parameters

plot.params <- getDefaultPlotParams(plot.type = 2)
plot.params$data1outmargin <- 800
plot.params$data2outmargin <- 800
plot.params$topmargin <- 800

Lets plot the data

#kp <- plotKaryotype(genome= genome_gr, plot.type = 2, plot.params = plot.params)
#kpPlotmarkers(kp, snps_gr, labels =  snp_labels)
#kpLines(kp, numeric_data_gr, y = numeric_data$y, data.panel = 2)

I had to use hashtags for the code that would not run.

Edge.R

Andrew Bolton

2023-11-17