We start off with the setup code provided in the assignment.
require("genoset")
## Loading required package: genoset
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist
##
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: XVector
# Gene models are provided by this library you may need to install it with
# biocLite('TxDb.Hsapiens.UCSC.hg19.knownGene') it may take a while
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
# reassign to shorter variable name
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
# you can extract ranges of interest using accessor functions see the
# `GenomicFeatures` package to see how they work
# here are some ranges of interest
# genes: GRanges object
allGenes <- genes(txdb)
# exons: GRanges object
allExons <- exons(txdb)
# transcripts: GRangesList object
allTranscripts <- transcripts(txdb)
# exons grouped by gene: GRangesList
exonsByGene <- exonsBy(txdb, by = "gene")
# transcripts grouped by gene: GRangesList
transcriptsByGene <- transcriptsBy(txdb, by = "gene")
# exons grouped by transcripts: GRangesList
exonsByTx <- exonsBy(txdb, by = "tx")
numGenes <- length(allGenes)
There are 23056 genes in the annotation.
numExons <- length(allExons)
avgExonsPerGene <- numExons/numGenes
There are 12.5767 exons per gene.
numTranscripts <- length(allTranscripts)
numTranscriptsPerGene <- numTranscripts/numGenes
There are 3.5982 transcripts per gene.
rangeExon = ranges(allExons)
lengthExon = end(rangeExon) - start(rangeExon) + 1
hist(lengthExon[lengthExon < 600], main = "Exon length (up to 600 bp)")
The median length for an exon is 137 bp the mode is located between 50 and 150 bp. The mean length is 340.5857 bp. The distribution has a substantial tail as can be seen from the mean. Axons upto length 600bp are only plotted in the histogram, since the number of elements falling in each bin is very less after that.
rangeTrans = ranges(allTranscripts)
lengthTrans = end(rangeTrans) - start(rangeTrans) + 1
hist(lengthTrans[lengthTrans < 80000])
The length of transcripts decreases exponentially fast. It becomes very less after 80000, hence we truncate the plot after that.
rangeGene = ranges(allGenes)
lengthGene = end(rangeGene) - start(rangeGene) + 1
hist(lengthGene[lengthGene < 1e+05])
positives = strand(allGenes) == "+"
negatives = strand(allGenes) == "-"
pos = allGenes[positives]
neg = allGenes[negatives]
strand(pos) = "*"
o <- findOverlaps(pos, neg)
There are 3001 out of 11319 negative genes and 11737 positive generes which overlap with each other
###Question 8
library(plotrix)
posRangeGene <- ranges(pos)
posLengthGene <- end(posRangeGene) - start(posRangeGene) + 1
negRangeGene <- ranges(neg)
negLengthGene <- end(negRangeGene) - start(negRangeGene) + 1
cLength <- list(posLengthGene[posLengthGene < 1e+05], negLengthGene[negLengthGene <
1e+05])
labels <- c("pos", "neg")
multhist(cLength, beside = TRUE, main = "Gene Length Distribution Across Strands < 100000",
labels = TRUE)
legend("topright", c("pos strand", "neg strand"), fil = c("black", "white"),
title = "Legend")
It is visible from the plot that there is not much difference in the gene length distribution across positive or negative strands, however for very small strands, the positive strands are longer, then negative strands are longer marginally, and along the tail of the distribution there is again a slight bias towards the positive strands.
# Exercise 1 Promoter regions:
pro <- promoters(allTranscripts, upstream = 1000, downstream = 200)
## Warning: trimmed start values to be positive
## Warning: trimmed end values to be <= seqlengths
# Question 9
library(AnnotationHub)
ah <- AnnotationHub()
cgi <- ah$goldenpath.hg19.database.cpgIslandExt_0.0.1.RData
ocg = findOverlaps(cgi, pro)
length(table(queryHits(ocg)))
## [1] 15317
It can be seen that 15317 out of the total 28691 promoter regions overlap with the UCSC CpG islands, which is 53.38 percent
library(BSgenome.Hsapiens.UCSC.hg19)
## Loading required package: BSgenome
## Loading required package: Biostrings
##
## Attaching package: 'BSgenome'
##
## The following object is masked from 'package:AnnotationDbi':
##
## species
chr16 <- Hsapiens[["chr16"]]
# Get non overlapping windows of length 256
ichunks <- IRanges::breakInChunks(length(chr16), 256)
chunks <- Views(chr16, ichunks)
## Warning: masks were dropped
# get rid of Ns:
nn <- letterFrequency(chunks, "N")
nnchunks <- chunks[nn == 0]
na <- letterFrequency(nnchunks, "A")
nc <- letterFrequency(nnchunks, "C")
ng <- letterFrequency(nnchunks, "G")
nt <- letterFrequency(nnchunks, "T")
# count number of CpGs in the window
nCpG <- vcountPattern("CG", nnchunks)
# Then the expected number of CpG's is
exp <- nc * ng/256
# O/E is
ooe = nCpG[exp > 0]/exp[exp > 0]
hist(ooe, breaks = 600, xlim = c(0.1, 1.2))
We see that there is a peak at around 0.2 and then the ditribution tapers off, it looks like a poisson distribution to some extent
#####Exercise 5
# The GC content (N_C+N_G) inside and outside promoter regions
a = seqnames(pro) == "chr16"
b = seqnames(allTranscripts) == "chr16"
pro_r <- ranges(pro[a])
pro_chunks <- Views(chr16, pro_r)
## Warning: masks were dropped
pro_nn <- letterFrequency(pro_chunks, "N")
pro_nnchunks <- pro_chunks[pro_nn == 0]
pro_nc <- letterFrequency(pro_nnchunks, "C")
pro_ng <- letterFrequency(pro_nnchunks, "G")
pro_nCpG <- vcountPattern("CG", pro_nnchunks)
pro_nc_ng <- (pro_nc + pro_ng)/width(pro_nnchunks)
proRanged <- RangedData(ranges(pro_nnchunks), pro_nc_ng)
# hist(proRanged$C);
genoPlot(x = proRanged, y = proRanged$C, main = "Plot of N_C+N_G Content Inside and Outside Promoter Regions",
pch = "*", col = "blue", xlab = "Interval", ylab = "Percent N_C+N_G")
# Outside the promoter region
outside_pro <- setdiff(IRanges(ichunks), IRanges(pro_r))
outside_nc <- letterFrequency(Views(chr16, outside_pro), "C")
outside_ng <- letterFrequency(Views(chr16, outside_pro), "G")
outside_nc_ng <- (outside_nc + outside_ng)/width(outside_pro)
outsideRanged <- RangedData(outside_pro, outside_nc_ng)
# Add plot to orginal
genoPlot(x = outsideRanged, y = outsideRanged$C, add = TRUE, col = "red", pch = "+")
legend("bottomright", c("inside", "outside"), fil = c("blue", "red"), title = "Legend")
#####Exercise 6
# The GC content (N_CG) inside and outside promoter regions
a = seqnames(pro) == "chr16"
b = seqnames(allTranscripts) == "chr16"
pro_r <- ranges(pro[a])
pro_chunks <- Views(chr16, pro_r)
## Warning: masks were dropped
pro_nn <- letterFrequency(pro_chunks, "N")
pro_nnchunks <- pro_chunks[pro_nn == 0]
pro_nc <- letterFrequency(pro_nnchunks, "C")
pro_g <- letterFrequency(pro_nnchunks, "G")
pro_nCpG <- vcountPattern("CG", pro_nnchunks)
pro_nCpG <- (pro_nCpG)/width(pro_nnchunks)
proRanged <- RangedData(ranges(pro_nnchunks), pro_nCpG)
genoPlot(x = proRanged, y = proRanged$pro_nCpG, main = "Plot of N_CG Content Inside and Outside Promoter Regions",
pch = "*", col = "blue", xlab = "Interval", ylab = "Percent N_CG")
# Outside the promoter region
outside_pro <- setdiff(IRanges(ichunks), IRanges(pro_r))
outside_nc <- letterFrequency(Views(chr16, outside_pro), "C")
outside_ng <- letterFrequency(Views(chr16, outside_pro), "G")
outside_nCpG <- vcountPattern("CG", Views(chr16, outside_pro))
outside_nCpG <- (outside_nCpG)/width(outside_pro)
outsideRanged <- RangedData(outside_pro, outside_nCpG)
# Add plot to orginal
genoPlot(x = outsideRanged, y = outsideRanged$outside_nCpG, add = TRUE, col = "red",
pch = "+")
legend("bottomright", c("inside", "outside"), fil = c("blue", "red"), title = "Legend")
From the plot we can see that the Nc+Ng content outside promoter regions is lower as compared to the inside regions.
#####Exercise 7
islands <- seqnames(cgi) == "chr16"
chrs <- chr16
island_r <- ranges(cgi[islands])
island_chunks <- Views(chr16, island_r)
## Warning: masks were dropped
island_nn <- letterFrequency(island_chunks, "N")
island_nnchunks <- pro_chunks[island_nn == 0]
nCpG <- vcountPattern("CG", island_nnchunks)
island_nc <- letterFrequency(island_nnchunks, "C")
island_ng <- letterFrequency(island_nnchunks, "G")
exp <- island_nc * island_ng/width(island_r)
## Warning: longer object length is not a multiple of shorter object length
OoEIn = nCpG[exp > 0]/exp[exp > 0]
hist(OoEIn, xlim = c(0.1, 1), breaks = 200)
outside_island <- setdiff(IRanges(ichunks), IRanges(island_r))
outside_nc <- letterFrequency(Views(chr16, outside_island), "C")
outside_ng <- letterFrequency(Views(chr16, outside_island), "G")
outside_nCpG <- vcountPattern("CG", Views(chr16, outside_island))
outside_exp <- outside_nc * outside_ng/width(outside_island)
## Warning: NAs produced by integer overflow
OoEOut = outside_nCpG[outside_exp > 0]/outside_exp[outside_exp > 0]
OoEOut[which(is.na(OoEOut))] <- 0
hist(OoEOut, xlim = c(0.1, 1), breaks = 100)
surprizingly, in this case, the observed over expected ratio is greater outside CpG islands.
#####Exercise 8
library(lattice)
C <- (nc + ng)/256
perCpG <- nCpG/256
CpG_df <- data.frame(C, perCpG)
## Error: arguments imply differing number of rows: 308138, 3212
cpgStrat <- shingle(CpG_df$C, intervals = rbind(c(0, 0.5), c(0.5, 0.6), c(0.6,
0.65), c(0.65, 0.7), c(0.7, 0.75), c(0.75, 0.8), c(0.8, 1)))
## Error: object 'CpG_df' not found
labels <- c("[0,0.5]", "(0.5,0.6]", "(0.6,0.65]", "(0.65,0.7]", "(0.7,0.75]",
"(0.75,0.8]", "(0.8,1]")
histogram(~CpG_df$perCpG | cpgStrat, main = "Proportion of CpG's Stratified By CG Content",
xlab = "CpG rate", layout = (c(1, 7)), strip = strip.custom(factor.levels = labels,
strip.levels = TRUE, strip.names = FALSE), scales = list(y = "free"))
## Error: object 'cpgStrat' not found
We see that CpG rate is bimodal when GC content is higher and turns to somewhat like a poisson distribution for lower CpG rate