Q1: What fraction of reads in this file has an ‘A’ nucleotide in the 5th base of the read?
# source("https://bioconductor.org/biocLite.R")
# biocLite("yeastRNASeq")
library(yeastRNASeq)
library(ShortRead)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit
## Loading required package: BiocParallel
## Loading required package: Biostrings
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
fastqFilePath <- system.file("reads",
"wt_1_f.fastq.gz",
package = "yeastRNASeq")
fqFile <- FastqFile(fastqFilePath)
reads <- readFastq(fqFile)
reads_set <- sread(reads)
sum(DNAStringSet(reads_set, 5, 5) == "A") / length(reads_set)
## [1] 0.363841
Q2: What is the average numeric quality value of the 5th base of these reads?
qm <- as(quality(reads), "matrix")
mean(qm[, 5:5])
## [1] 28.93346
Q3: In this interval, how many reads are duplicated by position?
# source("https://bioconductor.org/biocLite.R")
# biocLite("leeBamViews")
library(leeBamViews)
## Loading required package: BSgenome
## Loading required package: rtracklayer
bamFilePath <- system.file("bam",
"isowt5_13e.bam",
package="leeBamViews")
# In this interval, how many reads are duplicated by position?
bamFile <- BamFile(bamFilePath)
seqinfo(bamFile)
## Seqinfo object with 17 sequences from an unspecified genome:
## seqnames seqlengths isCircular genome
## Scchr01 230208 <NA> <NA>
## Scchr02 813178 <NA> <NA>
## Scchr03 316617 <NA> <NA>
## Scchr04 1531919 <NA> <NA>
## Scchr05 576869 <NA> <NA>
## ... ... ... ...
## Scchr13 924429 <NA> <NA>
## Scchr14 784334 <NA> <NA>
## Scchr15 1091289 <NA> <NA>
## Scchr16 948062 <NA> <NA>
## Scmito 85779 <NA> <NA>
aln <- scanBam(bamFile)
aln <- aln[[1]]
names(aln)
## [1] "qname" "flag" "rname" "strand" "pos" "qwidth" "mapq"
## [8] "cigar" "mrnm" "mpos" "isize" "seq" "qual"
lapply(aln, function(xx) xx[1])
## $qname
## [1] "GA-EAS46_1_209DH:5:14:206:926"
##
## $flag
## [1] 16
##
## $rname
## [1] Scchr13
## 17 Levels: Scchr01 Scchr02 Scchr03 Scchr04 Scchr05 Scchr06 ... Scmito
##
## $strand
## [1] -
## Levels: + - *
##
## $pos
## [1] 799975
##
## $qwidth
## [1] 36
##
## $mapq
## [1] 25
##
## $cigar
## [1] "36M"
##
## $mrnm
## [1] <NA>
## 17 Levels: Scchr01 Scchr02 Scchr03 Scchr04 Scchr05 Scchr06 ... Scmito
##
## $mpos
## [1] 0
##
## $isize
## [1] 0
##
## $seq
## A DNAStringSet instance of length 1
## width seq
## [1] 36 GTTATCCAAGTCTTTTTAGTTACCACTCTATCCTCC
##
## $qual
## A PhredQuality instance of length 1
## width seq
## [1] 36 JONSHWfURPJUgXcadYKhfUhh^hhhhhhhh^hh
unique(aln$rname)
## [1] Scchr13
## 17 Levels: Scchr01 Scchr02 Scchr03 Scchr04 Scchr05 Scchr06 ... Scmito
gr <- GRanges(seqnames = "Scchr13",
ranges = IRanges(start = 800000, end = 801000))
params <- ScanBamParam(which = gr,
what = scanBamWhat())
aln <- scanBam(bamFile, param = params)
aln <- aln[[1]]
aln$pos # 327 total
## [1] 799975 799977 799979 799980 799986 799986 799989 799995 800000 800000
## [11] 800001 800002 800009 800011 800014 800019 800021 800026 800030 800030
## [21] 800033 800038 800038 800043 800045 800046 800047 800048 800048 800049
## [31] 800049 800049 800052 800058 800060 800062 800068 800068 800068 800068
## [41] 800068 800069 800078 800080 800084 800084 800085 800086 800095 800111
## [51] 800112 800121 800125 800126 800129 800130 800133 800134 800161 800161
## [61] 800166 800173 800175 800179 800182 800186 800191 800196 800197 800199
## [71] 800202 800203 800205 800208 800210 800211 800214 800217 800220 800221
## [81] 800222 800222 800222 800223 800225 800226 800227 800228 800228 800228
## [91] 800243 800251 800253 800255 800256 800256 800259 800264 800272 800272
## [101] 800286 800294 800296 800302 800304 800306 800312 800320 800344 800346
## [111] 800348 800372 800373 800381 800382 800382 800397 800398 800398 800400
## [121] 800400 800400 800400 800408 800411 800413 800429 800431 800434 800434
## [131] 800435 800436 800440 800455 800459 800467 800471 800471 800477 800478
## [141] 800479 800479 800479 800480 800484 800484 800488 800489 800489 800489
## [151] 800492 800493 800497 800504 800513 800516 800517 800518 800518 800519
## [161] 800521 800524 800532 800537 800541 800541 800541 800541 800541 800541
## [171] 800541 800541 800541 800544 800545 800545 800546 800546 800561 800565
## [181] 800566 800566 800569 800569 800571 800585 800591 800593 800600 800628
## [191] 800631 800640 800642 800642 800652 800687 800689 800689 800695 800704
## [201] 800704 800706 800712 800714 800720 800722 800724 800730 800735 800736
## [211] 800745 800748 800748 800751 800753 800757 800761 800761 800763 800765
## [221] 800770 800774 800775 800776 800778 800778 800781 800788 800788 800788
## [231] 800792 800800 800816 800822 800827 800828 800832 800833 800833 800833
## [241] 800837 800839 800844 800844 800845 800845 800846 800847 800847 800853
## [251] 800857 800857 800857 800858 800859 800862 800864 800866 800870 800872
## [261] 800872 800872 800872 800873 800877 800878 800880 800886 800887 800890
## [271] 800892 800897 800897 800898 800899 800903 800904 800907 800907 800909
## [281] 800909 800909 800913 800913 800913 800913 800913 800914 800915 800916
## [291] 800917 800919 800922 800925 800926 800930 800933 800940 800940 800941
## [301] 800942 800945 800945 800945 800945 800945 800945 800948 800950 800955
## [311] 800957 800962 800964 800969 800972 800973 800973 800973 800978 800980
## [321] 800981 800982 800983 800983 800987 800989 800989
duplicatedValues = unique(aln$pos[duplicated(aln$pos)]) # duplicated positions and their names
sum(aln$pos %in% duplicatedValues)
## [1] 129
Q4: What is the average number of reads across the 8 samples falling in this interval?
library(GenomicRanges)
bpaths <- list.files(system.file("bam", package = "leeBamViews"),
pattern = "bam$",
full=TRUE)
gr <- GRanges(seqnames = "Scchr13",
ranges = IRanges(start = 807762, end = 808068))
# Scchr13:807762-808068
## ----BamViews------------------------------------------------------------
bamView <- BamViews(bpaths)
bamRanges(bamView) <- gr
aln <- scanBam(bamView)
names(aln)
## [1] "isowt5_13e.bam" "isowt6_13e.bam" "rlp5_13e.bam" "rlp6_13e.bam"
## [5] "ssr1_13e.bam" "ssr2_13e.bam" "xrn1_13e.bam" "xrn2_13e.bam"
names(aln[[1]])
## [1] "Scchr13:807762-808068"
lens <- list()
for(i in 1:length(aln)) {
lens[i] <- length(aln[[i]][[1]]$seq)
}
mean(unlist(lens))
## [1] 90.25
Q5: What is the average expression across samples in the control group for the “8149273” probeset (this is a character identifier, not a row number).
library(oligo) # For affy and nimblegen arrays geneExp and snp arrays
## Loading required package: oligoClasses
## Welcome to oligoClasses version 1.34.0
##
## Attaching package: 'oligoClasses'
## The following objects are masked from 'package:ShortRead':
##
## chromosome, position
## ===========================================================================
## Welcome to oligo version 1.36.1
## ===========================================================================
##
## Attaching package: 'oligo'
## The following object is masked from 'package:ShortRead':
##
## intensity
library(GEOquery)
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
geoMat <- getGEO("GSE38792")
## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE38nnn/GSE38792/matrix/
## Found 1 file(s)
## GSE38792_series_matrix.txt.gz
## File stored at:
## /tmp/RtmpayfO9G/GPL6244.soft
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist
pD.all <- pData(geoMat[[1]])
getGEOSuppFiles("GSE38792") # remember, raw data is in supplementary
## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE38nnn/GSE38792/suppl/
# list.files("GSE38792")
untar("GSE38792/GSE38792_RAW.tar", exdir = "GSE38792/CEL")
list.files("GSE38792/CEL") # samples from control to samples with sleep apnea
## [1] "GSM949164_Control1.CEL.gz" "GSM949166_Control2.CEL.gz"
## [3] "GSM949168_Control3.CEL.gz" "GSM949169_Control4.CEL.gz"
## [5] "GSM949170_Control5.CEL.gz" "GSM949171_Control6.CEL.gz"
## [7] "GSM949172_Control7.CEL.gz" "GSM949173_Control8.CEL.gz"
## [9] "GSM949174_OSA1.CEL.gz" "GSM949175_OSA2.CEL.gz"
## [11] "GSM949176_OSA3.CEL.gz" "GSM949177_OSA4.CEL.gz"
## [13] "GSM949178_OSA5.CEL.gz" "GSM949179_OSA6.CEL.gz"
## [15] "GSM949180_OSA7.CEL.gz" "GSM949181_OSA8.CEL.gz"
## [17] "GSM949182_OSA9.CEL.gz" "GSM949183_OSA10.CEL.gz"
celfiles <- list.files("GSE38792/CEL", full = TRUE)
rawData <- read.celfiles(celfiles)
## Loading required package: pd.hugene.1.0.st.v1
## Loading required package: RSQLite
## Loading required package: DBI
## Platform design info loaded.
## Reading in : GSE38792/CEL/GSM949164_Control1.CEL.gz
## Reading in : GSE38792/CEL/GSM949166_Control2.CEL.gz
## Reading in : GSE38792/CEL/GSM949168_Control3.CEL.gz
## Reading in : GSE38792/CEL/GSM949169_Control4.CEL.gz
## Reading in : GSE38792/CEL/GSM949170_Control5.CEL.gz
## Reading in : GSE38792/CEL/GSM949171_Control6.CEL.gz
## Reading in : GSE38792/CEL/GSM949172_Control7.CEL.gz
## Reading in : GSE38792/CEL/GSM949173_Control8.CEL.gz
## Reading in : GSE38792/CEL/GSM949174_OSA1.CEL.gz
## Reading in : GSE38792/CEL/GSM949175_OSA2.CEL.gz
## Reading in : GSE38792/CEL/GSM949176_OSA3.CEL.gz
## Reading in : GSE38792/CEL/GSM949177_OSA4.CEL.gz
## Reading in : GSE38792/CEL/GSM949178_OSA5.CEL.gz
## Reading in : GSE38792/CEL/GSM949179_OSA6.CEL.gz
## Reading in : GSE38792/CEL/GSM949180_OSA7.CEL.gz
## Reading in : GSE38792/CEL/GSM949181_OSA8.CEL.gz
## Reading in : GSE38792/CEL/GSM949182_OSA9.CEL.gz
## Reading in : GSE38792/CEL/GSM949183_OSA10.CEL.gz
# rawData # pd.hugene.1.0.st.v1 human gene vs 1 based on random priming
filename <- sampleNames(rawData)
pData(rawData)$filename <- filename
sampleNames <- sub(".*_", "", filename)
sampleNames <- sub(".CEL.gz$", "", sampleNames)
sampleNames(rawData) <- sampleNames
pData(rawData)$group <- ifelse(grepl("^OSA", sampleNames(rawData)), "OSA", "Control")
pData(rawData)
## index filename group
## Control1 1 GSM949164_Control1.CEL.gz Control
## Control2 2 GSM949166_Control2.CEL.gz Control
## Control3 3 GSM949168_Control3.CEL.gz Control
## Control4 4 GSM949169_Control4.CEL.gz Control
## Control5 5 GSM949170_Control5.CEL.gz Control
## Control6 6 GSM949171_Control6.CEL.gz Control
## Control7 7 GSM949172_Control7.CEL.gz Control
## Control8 8 GSM949173_Control8.CEL.gz Control
## OSA1 9 GSM949174_OSA1.CEL.gz OSA
## OSA2 10 GSM949175_OSA2.CEL.gz OSA
## OSA3 11 GSM949176_OSA3.CEL.gz OSA
## OSA4 12 GSM949177_OSA4.CEL.gz OSA
## OSA5 13 GSM949178_OSA5.CEL.gz OSA
## OSA6 14 GSM949179_OSA6.CEL.gz OSA
## OSA7 15 GSM949180_OSA7.CEL.gz OSA
## OSA8 16 GSM949181_OSA8.CEL.gz OSA
## OSA9 17 GSM949182_OSA9.CEL.gz OSA
## OSA10 18 GSM949183_OSA10.CEL.gz OSA
normData <- rma(rawData)
## Background correcting
## Normalizing
## Calculating Expression
expr <- exprs(normData)
mean(expr["8149273", 1:8])
## [1] 7.02183
Q6: What is the absolute value of the log foldchange (log2 FC) of the gene with the lowest P.value?
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:oligo':
##
## backgroundCorrect
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
design <- model.matrix(~ normData$group)
fit <- lmFit(normData, design)
fit <- eBayes(fit)
topTable(fit)
## Removing intercept from test coefficients
## logFC AveExpr t P.Value adj.P.Val B
## 8139843 -0.7126484 4.295282 -5.274246 3.979666e-05 0.4345455 -0.3661655
## 8175440 -0.5082288 5.382745 -4.593295 1.867479e-04 0.4345455 -1.0396815
## 8159963 -0.6602394 5.126963 -4.559928 2.016165e-04 0.4345455 -1.0744184
## 8148261 -0.8195649 5.849042 -4.446016 2.619936e-04 0.4345455 -1.1941456
## 8144512 -0.4837200 4.082467 -4.376250 3.076745e-04 0.4345455 -1.2683245
## 7893624 -1.0586901 4.867433 -4.280339 3.838608e-04 0.4345455 -1.3713170
## 7951807 -0.4578556 7.024299 -4.243910 4.175403e-04 0.4345455 -1.4107343
## 8173930 -0.4841767 5.118181 -4.226729 4.344397e-04 0.4345455 -1.4293806
## 7970082 -0.3744799 7.493430 -4.179494 4.845235e-04 0.4345455 -1.4808233
## 8057506 0.8047236 7.282087 4.098584 5.841428e-04 0.4345455 -1.5695411
abs(topTable(fit, n = 1)$logFC)
## Removing intercept from test coefficients
## [1] 0.7126484
Q7: How many genes are differentially expressed between the two groups at an adj.P.value cutoff of 0.05?
topTable(fit, p.value = 0.05)
## Removing intercept from test coefficients
## data frame with 0 columns and 0 rows
Q8: What is the mean difference in beta values between the 3 normal samples and the 3 cancer samples, across OpenSea CpGs?
library(minfi)
## Loading required package: lattice
## Loading required package: bumphunter
## Loading required package: foreach
## Parallel computing support for 'oligo/crlmm':
## Disabled
## - Load 'ff'
## - Load and register a 'foreach' adaptor
## Example - Using 'multicore' for 2 cores:
## library(doMC)
## registerDoMC(2)
## ===========================================================================
## Loading required package: iterators
## Loading required package: locfit
## locfit 1.5-9.1 2013-03-22
##
## Attaching package: 'locfit'
## The following objects are masked from 'package:GenomicAlignments':
##
## left, right
##
## Attaching package: 'minfi'
## The following object is masked from 'package:oligo':
##
## getProbeInfo
## The following object is masked from 'package:oligoClasses':
##
## getM
require(minfiData)
## Loading required package: minfiData
## Loading required package: IlluminaHumanMethylation450kmanifest
## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19
data(RGsetEx)
p <- preprocessFunnorm(RGsetEx)
## [preprocessFunnorm] Background and dye bias correction with noob
## [preprocessNoob] Using sample number 2 as reference level...
## [preprocessFunnorm] Mapping to genome
## [preprocessFunnorm] Quantile extraction
## [preprocessFunnorm] Normalization
b <- getBeta(p)
is <- getIslandStatus(p)
pData(p)$status
## [1] "normal" "normal" "cancer" "cancer" "normal" "cancer"
norm <- b[, c(1, 2, 5)]
can <- b[, c(3, 4, 6)]
norm_os <- norm[is == "OpenSea", ]
can_os <- can[is == "OpenSea", ]
mean(norm_os) - mean(can_os)
## [1] 0.08462416
Q9: How many of these DNase hypersensitive sites contain one or more CpGs on the 450k array?
library(AnnotationHub)
##
## Attaching package: 'AnnotationHub'
## The following object is masked from 'package:Biobase':
##
## cache
ah <- AnnotationHub()
## snapshotDate(): 2016-10-11
qah_h1 <- query(ah, c("Caco2", "AWG"))
h <- qah_h1[["AH22442"]]
## loading from cache '/home/mcc/.AnnotationHub/27870'
sum(countOverlaps(p, h))
## [1] 68183
ah_s <- subset(ah, genome == "hg19")
ah_s <- subset(ah, dataprovider == "UCSC")
write.csv(mcols(ah), "ah.csv")
g <- ah[["AH5018"]] # assembly
## loading from cache '/home/mcc/.AnnotationHub/5018'
cpg <- ah_s[["AH5086"]] # CpG islands
## loading from cache '/home/mcc/.AnnotationHub/5086'
h_cpg <- subsetByOverlaps(cpg, h)
ov <- subsetByOverlaps(h_cpg, p)
ov
## GRanges object with 21652 ranges and 1 metadata column:
## seqnames ranges strand | name
## <Rle> <IRanges> <Rle> | <character>
## [1] chr1 [713985, 714547] * | CpG:_60
## [2] chr1 [762417, 763445] * | CpG:_115
## [3] chr1 [805199, 805628] * | CpG:_50
## [4] chr1 [839695, 840619] * | CpG:_83
## [5] chr1 [854766, 854973] * | CpG:_16
## ... ... ... ... . ...
## [21648] chr22 [51135671, 51136118] * | CpG:_44
## [21649] chr22 [51142803, 51143308] * | CpG:_38
## [21650] chr22 [51158387, 51160060] * | CpG:_167
## [21651] chr22 [51169028, 51170019] * | CpG:_81
## [21652] chr22 [51221773, 51222317] * | CpG:_63
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
Q10: How many features are differentially expressed between control and treatment (ie. padj <= 0.05)?
# source("https://bioconductor.org/biocLite.R")
# biocLite("DESeq2")
# source("https://bioconductor.org/biocLite.R")
# biocLite("zebrafishRNASeq")
library(DESeq2)
library(zebrafishRNASeq)
data(zfGenes)
#exclude spike-in controls
tx <- zfGenes[grep("^ERCC",
rownames(zfGenes),
invert = T), ]
counts_mat <- as.matrix(tx)
colData <- DataFrame(sampleID = colnames(tx),
group = as.factor(c("control", "control", "control", "treatment", "treatment", "treatment")))
ddsMat <- DESeqDataSetFromMatrix(counts_mat, colData, design = ~ group)
ddsMat <- DESeq(ddsMat)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(ddsMat)
res <- res[order(res$padj), ]
sigRes <- subset(res, padj <= 0.05)
dim(sigRes)
## [1] 72 6