Q1: What fraction of reads in this file has an ‘A’ nucleotide in the 5th base of the read?

# source("https://bioconductor.org/biocLite.R")
# biocLite("yeastRNASeq")

library(yeastRNASeq)
library(ShortRead)

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit

## Loading required package: BiocParallel

## Loading required package: Biostrings

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums

## Loading required package: IRanges

## Loading required package: XVector

## Loading required package: Rsamtools

## Loading required package: GenomeInfoDb

## Loading required package: GenomicRanges

## Loading required package: GenomicAlignments

## Loading required package: SummarizedExperiment

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

fastqFilePath <-  system.file("reads", 
                              "wt_1_f.fastq.gz", 
                              package = "yeastRNASeq")

fqFile <- FastqFile(fastqFilePath)
reads <- readFastq(fqFile)
reads_set <- sread(reads)
sum(DNAStringSet(reads_set, 5, 5) == "A") / length(reads_set)

## [1] 0.363841

Q2: What is the average numeric quality value of the 5th base of these reads?

qm <- as(quality(reads), "matrix")
mean(qm[, 5:5])

## [1] 28.93346

Q3: In this interval, how many reads are duplicated by position?

# source("https://bioconductor.org/biocLite.R")
# biocLite("leeBamViews")

library(leeBamViews)

## Loading required package: BSgenome

## Loading required package: rtracklayer

bamFilePath <- system.file("bam", 
                           "isowt5_13e.bam", 
                           package="leeBamViews")

# In this interval, how many reads are duplicated by position?
bamFile <- BamFile(bamFilePath)
seqinfo(bamFile)

## Seqinfo object with 17 sequences from an unspecified genome:
##   seqnames seqlengths isCircular genome
##   Scchr01      230208       <NA>   <NA>
##   Scchr02      813178       <NA>   <NA>
##   Scchr03      316617       <NA>   <NA>
##   Scchr04     1531919       <NA>   <NA>
##   Scchr05      576869       <NA>   <NA>
##   ...             ...        ...    ...
##   Scchr13      924429       <NA>   <NA>
##   Scchr14      784334       <NA>   <NA>
##   Scchr15     1091289       <NA>   <NA>
##   Scchr16      948062       <NA>   <NA>
##   Scmito        85779       <NA>   <NA>

aln <- scanBam(bamFile)
aln <- aln[[1]]  
names(aln)

##  [1] "qname"  "flag"   "rname"  "strand" "pos"    "qwidth" "mapq"  
##  [8] "cigar"  "mrnm"   "mpos"   "isize"  "seq"    "qual"

lapply(aln, function(xx) xx[1])

## $qname
## [1] "GA-EAS46_1_209DH:5:14:206:926"
## 
## $flag
## [1] 16
## 
## $rname
## [1] Scchr13
## 17 Levels: Scchr01 Scchr02 Scchr03 Scchr04 Scchr05 Scchr06 ... Scmito
## 
## $strand
## [1] -
## Levels: + - *
## 
## $pos
## [1] 799975
## 
## $qwidth
## [1] 36
## 
## $mapq
## [1] 25
## 
## $cigar
## [1] "36M"
## 
## $mrnm
## [1] <NA>
## 17 Levels: Scchr01 Scchr02 Scchr03 Scchr04 Scchr05 Scchr06 ... Scmito
## 
## $mpos
## [1] 0
## 
## $isize
## [1] 0
## 
## $seq
##   A DNAStringSet instance of length 1
##     width seq
## [1]    36 GTTATCCAAGTCTTTTTAGTTACCACTCTATCCTCC
## 
## $qual
##   A PhredQuality instance of length 1
##     width seq
## [1]    36 JONSHWfURPJUgXcadYKhfUhh^hhhhhhhh^hh

unique(aln$rname)

## [1] Scchr13
## 17 Levels: Scchr01 Scchr02 Scchr03 Scchr04 Scchr05 Scchr06 ... Scmito

gr <- GRanges(seqnames = "Scchr13", 
              ranges = IRanges(start = 800000, end = 801000))

params <- ScanBamParam(which = gr, 
                       what = scanBamWhat())

aln <- scanBam(bamFile, param = params)
aln <- aln[[1]]  
aln$pos # 327 total

##   [1] 799975 799977 799979 799980 799986 799986 799989 799995 800000 800000
##  [11] 800001 800002 800009 800011 800014 800019 800021 800026 800030 800030
##  [21] 800033 800038 800038 800043 800045 800046 800047 800048 800048 800049
##  [31] 800049 800049 800052 800058 800060 800062 800068 800068 800068 800068
##  [41] 800068 800069 800078 800080 800084 800084 800085 800086 800095 800111
##  [51] 800112 800121 800125 800126 800129 800130 800133 800134 800161 800161
##  [61] 800166 800173 800175 800179 800182 800186 800191 800196 800197 800199
##  [71] 800202 800203 800205 800208 800210 800211 800214 800217 800220 800221
##  [81] 800222 800222 800222 800223 800225 800226 800227 800228 800228 800228
##  [91] 800243 800251 800253 800255 800256 800256 800259 800264 800272 800272
## [101] 800286 800294 800296 800302 800304 800306 800312 800320 800344 800346
## [111] 800348 800372 800373 800381 800382 800382 800397 800398 800398 800400
## [121] 800400 800400 800400 800408 800411 800413 800429 800431 800434 800434
## [131] 800435 800436 800440 800455 800459 800467 800471 800471 800477 800478
## [141] 800479 800479 800479 800480 800484 800484 800488 800489 800489 800489
## [151] 800492 800493 800497 800504 800513 800516 800517 800518 800518 800519
## [161] 800521 800524 800532 800537 800541 800541 800541 800541 800541 800541
## [171] 800541 800541 800541 800544 800545 800545 800546 800546 800561 800565
## [181] 800566 800566 800569 800569 800571 800585 800591 800593 800600 800628
## [191] 800631 800640 800642 800642 800652 800687 800689 800689 800695 800704
## [201] 800704 800706 800712 800714 800720 800722 800724 800730 800735 800736
## [211] 800745 800748 800748 800751 800753 800757 800761 800761 800763 800765
## [221] 800770 800774 800775 800776 800778 800778 800781 800788 800788 800788
## [231] 800792 800800 800816 800822 800827 800828 800832 800833 800833 800833
## [241] 800837 800839 800844 800844 800845 800845 800846 800847 800847 800853
## [251] 800857 800857 800857 800858 800859 800862 800864 800866 800870 800872
## [261] 800872 800872 800872 800873 800877 800878 800880 800886 800887 800890
## [271] 800892 800897 800897 800898 800899 800903 800904 800907 800907 800909
## [281] 800909 800909 800913 800913 800913 800913 800913 800914 800915 800916
## [291] 800917 800919 800922 800925 800926 800930 800933 800940 800940 800941
## [301] 800942 800945 800945 800945 800945 800945 800945 800948 800950 800955
## [311] 800957 800962 800964 800969 800972 800973 800973 800973 800978 800980
## [321] 800981 800982 800983 800983 800987 800989 800989

duplicatedValues = unique(aln$pos[duplicated(aln$pos)]) # duplicated positions and their names

sum(aln$pos %in% duplicatedValues)

## [1] 129

Q4: What is the average number of reads across the 8 samples falling in this interval?

library(GenomicRanges)

bpaths <- list.files(system.file("bam", package = "leeBamViews"), 
                     pattern = "bam$", 
                     full=TRUE)
gr <- GRanges(seqnames = "Scchr13", 
              ranges = IRanges(start = 807762, end = 808068))
# Scchr13:807762-808068

## ----BamViews------------------------------------------------------------
bamView <- BamViews(bpaths)
bamRanges(bamView) <- gr
aln <- scanBam(bamView)
names(aln)

## [1] "isowt5_13e.bam" "isowt6_13e.bam" "rlp5_13e.bam"   "rlp6_13e.bam"  
## [5] "ssr1_13e.bam"   "ssr2_13e.bam"   "xrn1_13e.bam"   "xrn2_13e.bam"

names(aln[[1]])

## [1] "Scchr13:807762-808068"

lens <- list()
for(i in 1:length(aln)) {
  lens[i] <- length(aln[[i]][[1]]$seq)
}

mean(unlist(lens))

## [1] 90.25

Q5: What is the average expression across samples in the control group for the “8149273” probeset (this is a character identifier, not a row number).

library(oligo) # For affy and nimblegen arrays geneExp and snp arrays

## Loading required package: oligoClasses

## Welcome to oligoClasses version 1.34.0

## 
## Attaching package: 'oligoClasses'

## The following objects are masked from 'package:ShortRead':
## 
##     chromosome, position

## ===========================================================================

## Welcome to oligo version 1.36.1

## ===========================================================================

## 
## Attaching package: 'oligo'

## The following object is masked from 'package:ShortRead':
## 
##     intensity

library(GEOquery)

## Setting options('download.file.method.GEOquery'='auto')

## Setting options('GEOquery.inmemory.gpl'=FALSE)

geoMat <- getGEO("GSE38792")

## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE38nnn/GSE38792/matrix/

## Found 1 file(s)

## GSE38792_series_matrix.txt.gz

## File stored at:

## /tmp/RtmpayfO9G/GPL6244.soft

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

pD.all <- pData(geoMat[[1]])

getGEOSuppFiles("GSE38792") # remember, raw data is in supplementary

## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE38nnn/GSE38792/suppl/

# list.files("GSE38792")

untar("GSE38792/GSE38792_RAW.tar", exdir = "GSE38792/CEL")
list.files("GSE38792/CEL") # samples from control to samples with sleep apnea

##  [1] "GSM949164_Control1.CEL.gz" "GSM949166_Control2.CEL.gz"
##  [3] "GSM949168_Control3.CEL.gz" "GSM949169_Control4.CEL.gz"
##  [5] "GSM949170_Control5.CEL.gz" "GSM949171_Control6.CEL.gz"
##  [7] "GSM949172_Control7.CEL.gz" "GSM949173_Control8.CEL.gz"
##  [9] "GSM949174_OSA1.CEL.gz"     "GSM949175_OSA2.CEL.gz"    
## [11] "GSM949176_OSA3.CEL.gz"     "GSM949177_OSA4.CEL.gz"    
## [13] "GSM949178_OSA5.CEL.gz"     "GSM949179_OSA6.CEL.gz"    
## [15] "GSM949180_OSA7.CEL.gz"     "GSM949181_OSA8.CEL.gz"    
## [17] "GSM949182_OSA9.CEL.gz"     "GSM949183_OSA10.CEL.gz"

celfiles <- list.files("GSE38792/CEL", full = TRUE)

rawData <- read.celfiles(celfiles)

## Loading required package: pd.hugene.1.0.st.v1

## Loading required package: RSQLite

## Loading required package: DBI

## Platform design info loaded.

## Reading in : GSE38792/CEL/GSM949164_Control1.CEL.gz
## Reading in : GSE38792/CEL/GSM949166_Control2.CEL.gz
## Reading in : GSE38792/CEL/GSM949168_Control3.CEL.gz
## Reading in : GSE38792/CEL/GSM949169_Control4.CEL.gz
## Reading in : GSE38792/CEL/GSM949170_Control5.CEL.gz
## Reading in : GSE38792/CEL/GSM949171_Control6.CEL.gz
## Reading in : GSE38792/CEL/GSM949172_Control7.CEL.gz
## Reading in : GSE38792/CEL/GSM949173_Control8.CEL.gz
## Reading in : GSE38792/CEL/GSM949174_OSA1.CEL.gz
## Reading in : GSE38792/CEL/GSM949175_OSA2.CEL.gz
## Reading in : GSE38792/CEL/GSM949176_OSA3.CEL.gz
## Reading in : GSE38792/CEL/GSM949177_OSA4.CEL.gz
## Reading in : GSE38792/CEL/GSM949178_OSA5.CEL.gz
## Reading in : GSE38792/CEL/GSM949179_OSA6.CEL.gz
## Reading in : GSE38792/CEL/GSM949180_OSA7.CEL.gz
## Reading in : GSE38792/CEL/GSM949181_OSA8.CEL.gz
## Reading in : GSE38792/CEL/GSM949182_OSA9.CEL.gz
## Reading in : GSE38792/CEL/GSM949183_OSA10.CEL.gz

# rawData  # pd.hugene.1.0.st.v1  human gene vs 1 based on random priming

filename <- sampleNames(rawData)
pData(rawData)$filename <- filename
sampleNames <- sub(".*_", "", filename)
sampleNames <- sub(".CEL.gz$", "", sampleNames)
sampleNames(rawData) <- sampleNames

pData(rawData)$group <- ifelse(grepl("^OSA", sampleNames(rawData)), "OSA", "Control")
pData(rawData)

##          index                  filename   group
## Control1     1 GSM949164_Control1.CEL.gz Control
## Control2     2 GSM949166_Control2.CEL.gz Control
## Control3     3 GSM949168_Control3.CEL.gz Control
## Control4     4 GSM949169_Control4.CEL.gz Control
## Control5     5 GSM949170_Control5.CEL.gz Control
## Control6     6 GSM949171_Control6.CEL.gz Control
## Control7     7 GSM949172_Control7.CEL.gz Control
## Control8     8 GSM949173_Control8.CEL.gz Control
## OSA1         9     GSM949174_OSA1.CEL.gz     OSA
## OSA2        10     GSM949175_OSA2.CEL.gz     OSA
## OSA3        11     GSM949176_OSA3.CEL.gz     OSA
## OSA4        12     GSM949177_OSA4.CEL.gz     OSA
## OSA5        13     GSM949178_OSA5.CEL.gz     OSA
## OSA6        14     GSM949179_OSA6.CEL.gz     OSA
## OSA7        15     GSM949180_OSA7.CEL.gz     OSA
## OSA8        16     GSM949181_OSA8.CEL.gz     OSA
## OSA9        17     GSM949182_OSA9.CEL.gz     OSA
## OSA10       18    GSM949183_OSA10.CEL.gz     OSA

normData <- rma(rawData)

## Background correcting
## Normalizing
## Calculating Expression

expr <- exprs(normData)
mean(expr["8149273", 1:8])

## [1] 7.02183

Q6: What is the absolute value of the log foldchange (log2 FC) of the gene with the lowest P.value?

library(limma)

## 
## Attaching package: 'limma'

## The following object is masked from 'package:oligo':
## 
##     backgroundCorrect

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

design <- model.matrix(~ normData$group)

fit <- lmFit(normData, design)
fit <- eBayes(fit)
topTable(fit)

## Removing intercept from test coefficients

##              logFC  AveExpr         t      P.Value adj.P.Val          B
## 8139843 -0.7126484 4.295282 -5.274246 3.979666e-05 0.4345455 -0.3661655
## 8175440 -0.5082288 5.382745 -4.593295 1.867479e-04 0.4345455 -1.0396815
## 8159963 -0.6602394 5.126963 -4.559928 2.016165e-04 0.4345455 -1.0744184
## 8148261 -0.8195649 5.849042 -4.446016 2.619936e-04 0.4345455 -1.1941456
## 8144512 -0.4837200 4.082467 -4.376250 3.076745e-04 0.4345455 -1.2683245
## 7893624 -1.0586901 4.867433 -4.280339 3.838608e-04 0.4345455 -1.3713170
## 7951807 -0.4578556 7.024299 -4.243910 4.175403e-04 0.4345455 -1.4107343
## 8173930 -0.4841767 5.118181 -4.226729 4.344397e-04 0.4345455 -1.4293806
## 7970082 -0.3744799 7.493430 -4.179494 4.845235e-04 0.4345455 -1.4808233
## 8057506  0.8047236 7.282087  4.098584 5.841428e-04 0.4345455 -1.5695411

abs(topTable(fit, n = 1)$logFC)

## Removing intercept from test coefficients

## [1] 0.7126484

Q7: How many genes are differentially expressed between the two groups at an adj.P.value cutoff of 0.05?

topTable(fit, p.value = 0.05)

## Removing intercept from test coefficients

## data frame with 0 columns and 0 rows

Q8: What is the mean difference in beta values between the 3 normal samples and the 3 cancer samples, across OpenSea CpGs?

library(minfi)

## Loading required package: lattice

## Loading required package: bumphunter

## Loading required package: foreach

## Parallel computing support for 'oligo/crlmm':

## Disabled

##      - Load 'ff'

##      - Load and register a 'foreach' adaptor

##         Example - Using 'multicore' for 2 cores:

##              library(doMC)

##              registerDoMC(2)

## ===========================================================================

## Loading required package: iterators

## Loading required package: locfit

## locfit 1.5-9.1    2013-03-22

## 
## Attaching package: 'locfit'

## The following objects are masked from 'package:GenomicAlignments':
## 
##     left, right

## 
## Attaching package: 'minfi'

## The following object is masked from 'package:oligo':
## 
##     getProbeInfo

## The following object is masked from 'package:oligoClasses':
## 
##     getM

require(minfiData)

## Loading required package: minfiData

## Loading required package: IlluminaHumanMethylation450kmanifest

## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19

data(RGsetEx)

p <- preprocessFunnorm(RGsetEx)

## [preprocessFunnorm] Background and dye bias correction with noob 
## [preprocessNoob] Using sample number 2 as reference level...
## [preprocessFunnorm] Mapping to genome
## [preprocessFunnorm] Quantile extraction
## [preprocessFunnorm] Normalization

b <- getBeta(p)
is <- getIslandStatus(p)
pData(p)$status

## [1] "normal" "normal" "cancer" "cancer" "normal" "cancer"

norm <- b[, c(1, 2, 5)]
can <- b[, c(3, 4, 6)]
norm_os <- norm[is == "OpenSea", ]
can_os <- can[is == "OpenSea", ]
mean(norm_os) - mean(can_os)

## [1] 0.08462416

Q9: How many of these DNase hypersensitive sites contain one or more CpGs on the 450k array?

library(AnnotationHub)

## 
## Attaching package: 'AnnotationHub'

## The following object is masked from 'package:Biobase':
## 
##     cache

ah <- AnnotationHub()

## snapshotDate(): 2016-10-11

qah_h1 <- query(ah, c("Caco2", "AWG"))
h <- qah_h1[["AH22442"]]

## loading from cache '/home/mcc/.AnnotationHub/27870'

sum(countOverlaps(p, h))

## [1] 68183

ah_s <- subset(ah, genome == "hg19")
ah_s <- subset(ah, dataprovider == "UCSC")
write.csv(mcols(ah), "ah.csv")
g <- ah[["AH5018"]] # assembly

## loading from cache '/home/mcc/.AnnotationHub/5018'

cpg <- ah_s[["AH5086"]] # CpG islands

## loading from cache '/home/mcc/.AnnotationHub/5086'

h_cpg <- subsetByOverlaps(cpg, h)
ov <- subsetByOverlaps(h_cpg, p)

ov

## GRanges object with 21652 ranges and 1 metadata column:
##           seqnames               ranges strand |        name
##              <Rle>            <IRanges>  <Rle> | <character>
##       [1]     chr1     [713985, 714547]      * |     CpG:_60
##       [2]     chr1     [762417, 763445]      * |    CpG:_115
##       [3]     chr1     [805199, 805628]      * |     CpG:_50
##       [4]     chr1     [839695, 840619]      * |     CpG:_83
##       [5]     chr1     [854766, 854973]      * |     CpG:_16
##       ...      ...                  ...    ... .         ...
##   [21648]    chr22 [51135671, 51136118]      * |     CpG:_44
##   [21649]    chr22 [51142803, 51143308]      * |     CpG:_38
##   [21650]    chr22 [51158387, 51160060]      * |    CpG:_167
##   [21651]    chr22 [51169028, 51170019]      * |     CpG:_81
##   [21652]    chr22 [51221773, 51222317]      * |     CpG:_63
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

Q10: How many features are differentially expressed between control and treatment (ie. padj <= 0.05)?

# source("https://bioconductor.org/biocLite.R")
# biocLite("DESeq2")

# source("https://bioconductor.org/biocLite.R")
# biocLite("zebrafishRNASeq")

library(DESeq2)
library(zebrafishRNASeq)

data(zfGenes)

#exclude spike-in controls
tx <- zfGenes[grep("^ERCC", 
                   rownames(zfGenes), 
                   invert = T), ]

counts_mat <- as.matrix(tx)
colData <- DataFrame(sampleID = colnames(tx), 
                     group = as.factor(c("control", "control", "control", "treatment", "treatment", "treatment")))

ddsMat <- DESeqDataSetFromMatrix(counts_mat, colData, design = ~ group)
ddsMat <- DESeq(ddsMat)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

res <- results(ddsMat)
res <- res[order(res$padj), ]
sigRes <- subset(res, padj <= 0.05)
dim(sigRes)

## [1] 72  6

MCC Bioconductor Project 4

10/23/16

Q1: What fraction of reads in this file has an ‘A’ nucleotide in the 5th base of the read?

Q2: What is the average numeric quality value of the 5th base of these reads?

Q3: In this interval, how many reads are duplicated by position?

Q4: What is the average number of reads across the 8 samples falling in this interval?

Q5: What is the average expression across samples in the control group for the “8149273” probeset (this is a character identifier, not a row number).

Q6: What is the absolute value of the log foldchange (log2 FC) of the gene with the lowest P.value?

Q7: How many genes are differentially expressed between the two groups at an adj.P.value cutoff of 0.05?

Q8: What is the mean difference in beta values between the 3 normal samples and the 3 cancer samples, across OpenSea CpGs?

Q9: How many of these DNase hypersensitive sites contain one or more CpGs on the 450k array?

Q10: How many features are differentially expressed between control and treatment (ie. padj <= 0.05)?