Homework 1, Part 2

Bharat Singh

We start off with the setup code provided in the assignment.

require("genoset")

## Loading required package: genoset
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## 
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: XVector

# Gene models are provided by this library you may need to install it with
# biocLite('TxDb.Hsapiens.UCSC.hg19.knownGene') it may take a while
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi


# reassign to shorter variable name
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

# you can extract ranges of interest using accessor functions see the
# `GenomicFeatures` package to see how they work

# here are some ranges of interest

# genes: GRanges object
allGenes <- genes(txdb)

# exons: GRanges object
allExons <- exons(txdb)

# transcripts: GRangesList object
allTranscripts <- transcripts(txdb)

# exons grouped by gene: GRangesList
exonsByGene <- exonsBy(txdb, by = "gene")

# transcripts grouped by gene: GRangesList
transcriptsByGene <- transcriptsBy(txdb, by = "gene")

# exons grouped by transcripts: GRangesList
exonsByTx <- exonsBy(txdb, by = "tx")

Questions 1-8

Question 1.

numGenes <- length(allGenes)

There are 23056 genes in the annotation.

Question 2.

numExons <- length(allExons)
avgExonsPerGene <- numExons/numGenes

There are 12.5767 exons per gene.

Question 3.

numTranscripts <- length(allTranscripts)
numTranscriptsPerGene <- numTranscripts/numGenes

There are 3.5982 transcripts per gene.

Question 4.

rangeExon = ranges(allExons)
lengthExon = end(rangeExon) - start(rangeExon) + 1
hist(lengthExon[lengthExon < 600], main = "Exon length (up to 600 bp)")

plot of chunk Q4

The median length for an exon is 137 bp the mode is located between 50 and 150 bp. The mean length is 340.5857 bp. The distribution has a substantial tail as can be seen from the mean. Axons upto length 600bp are only plotted in the histogram, since the number of elements falling in each bin is very less after that.

Question 5.

rangeTrans = ranges(allTranscripts)
lengthTrans = end(rangeTrans) - start(rangeTrans) + 1
hist(lengthTrans[lengthTrans < 80000])

plot of chunk Q5

The length of transcripts decreases exponentially fast. It becomes very less after 80000, hence we truncate the plot after that.

Question 6.

rangeGene = ranges(allGenes)
lengthGene = end(rangeGene) - start(rangeGene) + 1
hist(lengthGene[lengthGene < 1e+05])

plot of chunk Q6

Question 7.

positives = strand(allGenes) == "+"
negatives = strand(allGenes) == "-"
pos = allGenes[positives]
neg = allGenes[negatives]
strand(pos) = "*"
o <- findOverlaps(pos, neg)

There are 3001 out of 11319 negative genes and 11737 positive generes which overlap with each other

###Question 8

library(plotrix)

posRangeGene <- ranges(pos)
posLengthGene <- end(posRangeGene) - start(posRangeGene) + 1
negRangeGene <- ranges(neg)
negLengthGene <- end(negRangeGene) - start(negRangeGene) + 1
cLength <- list(posLengthGene[posLengthGene < 1e+05], negLengthGene[negLengthGene < 
    1e+05])
labels <- c("pos", "neg")
multhist(cLength, beside = TRUE, main = "Gene Length Distribution Across Strands < 100000", 
    labels = TRUE)
legend("topright", c("pos strand", "neg strand"), fil = c("black", "white"), 
    title = "Legend")

plot of chunk Q8

It is visible from the plot that there is not much difference in the gene length distribution across positive or negative strands, however for very small strands, the positive strands are longer, then negative strands are longer marginally, and along the tail of the distribution there is again a slight bias towards the positive strands.

# Exercise 1 Promoter regions:
pro <- promoters(allTranscripts, upstream = 1000, downstream = 200)

## Warning: trimmed start values to be positive
## Warning: trimmed end values to be <= seqlengths


# Question 9
library(AnnotationHub)
ah <- AnnotationHub()
cgi <- ah$goldenpath.hg19.database.cpgIslandExt_0.0.1.RData

ocg = findOverlaps(cgi, pro)
length(table(queryHits(ocg)))

## [1] 15317

It can be seen that 15317 out of the total 28691 promoter regions overlap with the UCSC CpG islands, which is 53.38 percent

library(BSgenome.Hsapiens.UCSC.hg19)

## Loading required package: BSgenome
## Loading required package: Biostrings
## 
## Attaching package: 'BSgenome'
## 
## The following object is masked from 'package:AnnotationDbi':
## 
##     species

chr16 <- Hsapiens[["chr16"]]
# Get non overlapping windows of length 256
ichunks <- IRanges::breakInChunks(length(chr16), 256)
chunks <- Views(chr16, ichunks)

## Warning: masks were dropped

# get rid of Ns:
nn <- letterFrequency(chunks, "N")
nnchunks <- chunks[nn == 0]
na <- letterFrequency(nnchunks, "A")
nc <- letterFrequency(nnchunks, "C")
ng <- letterFrequency(nnchunks, "G")
nt <- letterFrequency(nnchunks, "T")

# count number of CpGs in the window
nCpG <- vcountPattern("CG", nnchunks)

# Then the expected number of CpG's is
exp <- nc * ng/256

# O/E is
ooe = nCpG[exp > 0]/exp[exp > 0]
hist(ooe, breaks = 600, xlim = c(0.1, 1.2))

plot of chunk Ex 2-8

We see that there is a peak at around 0.2 and then the ditribution tapers off, it looks like a poisson distribution to some extent

#####Exercise 5

# The GC content (N_C+N_G) inside and outside promoter regions

a = seqnames(pro) == "chr16"
b = seqnames(allTranscripts) == "chr16"

pro_r <- ranges(pro[a])
pro_chunks <- Views(chr16, pro_r)

## Warning: masks were dropped

pro_nn <- letterFrequency(pro_chunks, "N")
pro_nnchunks <- pro_chunks[pro_nn == 0]
pro_nc <- letterFrequency(pro_nnchunks, "C")
pro_ng <- letterFrequency(pro_nnchunks, "G")
pro_nCpG <- vcountPattern("CG", pro_nnchunks)
pro_nc_ng <- (pro_nc + pro_ng)/width(pro_nnchunks)
proRanged <- RangedData(ranges(pro_nnchunks), pro_nc_ng)
# hist(proRanged$C);
genoPlot(x = proRanged, y = proRanged$C, main = "Plot of N_C+N_G Content Inside and Outside Promoter Regions", 
    pch = "*", col = "blue", xlab = "Interval", ylab = "Percent N_C+N_G")

# Outside the promoter region
outside_pro <- setdiff(IRanges(ichunks), IRanges(pro_r))
outside_nc <- letterFrequency(Views(chr16, outside_pro), "C")
outside_ng <- letterFrequency(Views(chr16, outside_pro), "G")
outside_nc_ng <- (outside_nc + outside_ng)/width(outside_pro)
outsideRanged <- RangedData(outside_pro, outside_nc_ng)
# Add plot to orginal
genoPlot(x = outsideRanged, y = outsideRanged$C, add = TRUE, col = "red", pch = "+")
legend("bottomright", c("inside", "outside"), fil = c("blue", "red"), title = "Legend")

plot of chunk Exercise 5

#####Exercise 6

# The GC content (N_CG) inside and outside promoter regions

a = seqnames(pro) == "chr16"
b = seqnames(allTranscripts) == "chr16"

pro_r <- ranges(pro[a])
pro_chunks <- Views(chr16, pro_r)

## Warning: masks were dropped

pro_nn <- letterFrequency(pro_chunks, "N")
pro_nnchunks <- pro_chunks[pro_nn == 0]
pro_nc <- letterFrequency(pro_nnchunks, "C")
pro_g <- letterFrequency(pro_nnchunks, "G")
pro_nCpG <- vcountPattern("CG", pro_nnchunks)
pro_nCpG <- (pro_nCpG)/width(pro_nnchunks)
proRanged <- RangedData(ranges(pro_nnchunks), pro_nCpG)
genoPlot(x = proRanged, y = proRanged$pro_nCpG, main = "Plot of N_CG Content Inside and Outside Promoter Regions", 
    pch = "*", col = "blue", xlab = "Interval", ylab = "Percent N_CG")

# Outside the promoter region
outside_pro <- setdiff(IRanges(ichunks), IRanges(pro_r))
outside_nc <- letterFrequency(Views(chr16, outside_pro), "C")
outside_ng <- letterFrequency(Views(chr16, outside_pro), "G")
outside_nCpG <- vcountPattern("CG", Views(chr16, outside_pro))
outside_nCpG <- (outside_nCpG)/width(outside_pro)
outsideRanged <- RangedData(outside_pro, outside_nCpG)
# Add plot to orginal
genoPlot(x = outsideRanged, y = outsideRanged$outside_nCpG, add = TRUE, col = "red", 
    pch = "+")
legend("bottomright", c("inside", "outside"), fil = c("blue", "red"), title = "Legend")

plot of chunk Exercise 6

From the plot we can see that the Nc+Ng content outside promoter regions is lower as compared to the inside regions.

#####Exercise 7

islands <- seqnames(cgi) == "chr16"
chrs <- chr16

island_r <- ranges(cgi[islands])
island_chunks <- Views(chr16, island_r)

## Warning: masks were dropped

island_nn <- letterFrequency(island_chunks, "N")
island_nnchunks <- pro_chunks[island_nn == 0]

nCpG <- vcountPattern("CG", island_nnchunks)
island_nc <- letterFrequency(island_nnchunks, "C")
island_ng <- letterFrequency(island_nnchunks, "G")
exp <- island_nc * island_ng/width(island_r)

## Warning: longer object length is not a multiple of shorter object length

OoEIn = nCpG[exp > 0]/exp[exp > 0]
hist(OoEIn, xlim = c(0.1, 1), breaks = 200)

plot of chunk Exercise 7


outside_island <- setdiff(IRanges(ichunks), IRanges(island_r))
outside_nc <- letterFrequency(Views(chr16, outside_island), "C")
outside_ng <- letterFrequency(Views(chr16, outside_island), "G")
outside_nCpG <- vcountPattern("CG", Views(chr16, outside_island))
outside_exp <- outside_nc * outside_ng/width(outside_island)

## Warning: NAs produced by integer overflow

OoEOut = outside_nCpG[outside_exp > 0]/outside_exp[outside_exp > 0]
OoEOut[which(is.na(OoEOut))] <- 0
hist(OoEOut, xlim = c(0.1, 1), breaks = 100)

plot of chunk Exercise 7

surprizingly, in this case, the observed over expected ratio is greater outside CpG islands.

#####Exercise 8

library(lattice)

C <- (nc + ng)/256
perCpG <- nCpG/256
CpG_df <- data.frame(C, perCpG)

## Error: arguments imply differing number of rows: 308138, 3212


cpgStrat <- shingle(CpG_df$C, intervals = rbind(c(0, 0.5), c(0.5, 0.6), c(0.6, 
    0.65), c(0.65, 0.7), c(0.7, 0.75), c(0.75, 0.8), c(0.8, 1)))

## Error: object 'CpG_df' not found


labels <- c("[0,0.5]", "(0.5,0.6]", "(0.6,0.65]", "(0.65,0.7]", "(0.7,0.75]", 
    "(0.75,0.8]", "(0.8,1]")

histogram(~CpG_df$perCpG | cpgStrat, main = "Proportion of CpG's Stratified By CG Content", 
    xlab = "CpG rate", layout = (c(1, 7)), strip = strip.custom(factor.levels = labels, 
        strip.levels = TRUE, strip.names = FALSE), scales = list(y = "free"))

## Error: object 'cpgStrat' not found

We see that CpG rate is bimodal when GC content is higher and turns to somewhat like a poisson distribution for lower CpG rate