# source("https://bioconductor.org/biocLite.R")
# biocLite("ALL")
# browseVignettes("ALL")

# source("https://bioconductor.org/biocLite.R")
# biocLite("hgu95av2.db")

# install.packages(dplyr)

# source("https://bioconductor.org/biocLite.R")
# biocLite("airway")

# source("https://bioconductor.org/biocLite.R")
# biocLite("mixOmics")
# biocLite("doing")
# biocLite("minfiData")
# biocLite("biomaRt")
# biocLite("AnnotationHub")TxDb.Hsapiens.UCSC.hg19.knownGene
# biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")

# This little gem updates Bioconductor ONLY if R is also up to date.
# source("http://bioconductor.org/biocLite.R")
# biocLite()

load packages

packages <- function(pkg){
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg))
        install.packages(new.pkg, dependencies = TRUE,
                         repos='http://cran.rstudio.com/')
    sapply(pkg, require, character.only = TRUE)
}

packages(c("GEOquery", "Biobase", "ALL", "hgu95av2.db", "GenomicRanges", "airway", "biomaRt", "dplyr", "minfiData", "TxDb.Hsapiens.UCSC.hg19.knownGene", "AnnotationHub"))

## Loading required package: GEOquery

## Loading required package: Biobase

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Setting options('download.file.method.GEOquery'='auto')

## Setting options('GEOquery.inmemory.gpl'=FALSE)

## Loading required package: ALL

## Loading required package: hgu95av2.db

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: IRanges

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums

## Loading required package: org.Hs.eg.db

##

##

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: airway

## Loading required package: SummarizedExperiment

## Loading required package: biomaRt

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:biomaRt':
## 
##     select

## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union

## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect

## The following object is masked from 'package:AnnotationDbi':
## 
##     select

## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, regroup, setdiff, slice, union

## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union

## The following object is masked from 'package:Biobase':
## 
##     combine

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Loading required package: minfiData

## Loading required package: minfi

## Loading required package: lattice

## Loading required package: Biostrings

## Loading required package: XVector

## Loading required package: bumphunter

## Loading required package: foreach

## Loading required package: iterators

## Loading required package: locfit

## locfit 1.5-9.1    2013-03-22

## Loading required package: IlluminaHumanMethylation450kmanifest

## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19

## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene

## Loading required package: GenomicFeatures

## Loading required package: AnnotationHub

## 
## Attaching package: 'AnnotationHub'

## The following object is masked from 'package:dplyr':
## 
##     query

## The following object is masked from 'package:Biobase':
## 
##     cache

##                          GEOquery                           Biobase 
##                              TRUE                              TRUE 
##                               ALL                       hgu95av2.db 
##                              TRUE                              TRUE 
##                     GenomicRanges                            airway 
##                              TRUE                              TRUE 
##                           biomaRt                             dplyr 
##                              TRUE                              TRUE 
##                         minfiData TxDb.Hsapiens.UCSC.hg19.knownGene 
##                              TRUE                              TRUE 
##                     AnnotationHub 
##                              TRUE

Q1: What is the mean expression across all features for sample 5 in the ‘ALL’ dataset?

data(ALL)
sample5 <- ALL[,5]

# find mean expression for sample5
mean(exprs(sample5))

## [1] 5.629627

Q2: Using Ensembl, annotate each feature of the ‘ALL’ dataset with the Ensembl gene id. How many probesets (features) are annotated with more than one Ensembl gene id?

# connect to Ensembl
mart <- useMart(host = 'feb2014.archive.ensembl.org', 
                biomart = "ENSEMBL_MART_ENSEMBL")
ensembl <- useDataset("hsapiens_gene_ensembl", mart)
names <- featureNames(ALL)
# find affymatrix attributes
attrs <- listAttributes(ensembl, page = "feature_page")
# return results
result <- getBM(attributes = c("affy_hg_u95av2", 
                               "ensembl_gene_id", 
                               "chromosome_name"),
                filters = "affy_hg_u95av2", 
                values = names, 
                mart = ensembl)
tail(result)

##       affy_hg_u95av2 ensembl_gene_id    chromosome_name
## 13419     34906_g_at ENSG00000272706 HG1350_HG959_PATCH
## 13420       32208_at ENSG00000262924   HSCHR19_1_CTG3_1
## 13421       41220_at ENSG00000261843     HSCHR17_3_CTG4
## 13422        1436_at ENSG00000271903     HSCHR22_2_CTG1
## 13423       35778_at ENSG00000262907     HSCHR2_2_CTG12
## 13424     39086_g_at ENSG00000262771      HSCHR7_1_CTG6

# group result by affy_hg_u95av2
prob_set <- result %>%
  group_by (affy_hg_u95av2) %>%
  summarise(prob_count = n())

sum(prob_set$prob_count > 1)

## [1] 1045

Q3: How many probesets (Affymetrix IDs) are annotated with one or more genes on the autosomes, chromosomes 1 to 22?

result_autosome <- subset(result, chromosome_name < 23)
prob_set_autosome <- result_autosome %>%
  group_by (affy_hg_u95av2) %>%
  summarise(prob_count = n())

#tail(result_autosome)

sum(prob_set_autosome$prob_count > 0)

## [1] 7494

Q4: What is the mean value of the Methylation channel across the features for sample ‘5723646052_R04C01’?

data(MsetEx)
#pData(MsetEx)  # 5723646052_R04C01
sample_2 <- MsetEx[,2]  # return a MethylSet for sample #2

mean(getMeth(sample_2)) # produce mean of MethylSet for sample #2

## [1] 7228.277

Q5: Access the processed data from NCBI GEO Accession number GSE788. What is the mean expression level of sample ‘GSM9024’?

# library(GEOquery)
query <- getGEO("GSE788")

## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSEnnn/GSE788/matrix/

## Found 1 file(s)

## GSE788_series_matrix.txt.gz

## File stored at:

## /tmp/Rtmpf7pu3s/GPL85.soft

## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist

data <- query[[1]]
#pData(data)
GSM9024 <- data[,2]

mean(exprs(GSM9024))

## [1] 756.432

Q6: What is the mean length across the samples in the experiment?

# library(airway)
data(airway)

mean(airway$avgLength)

## [1] 113.75

Q7: What is the number of Ensembl genes which have a count of >= 1 read in sample ‘SRR1039512’?

SRR1039512 <- airway[,3]
counts <- assay(SRR1039512, "counts")

sum(counts >= 1) # summation(subset counts which are greater than or equal to 1)

## [1] 25699

Q8: The airway dataset contains more than 64k features. How many of these features overlaps with transcripts on the autosomes (chromosomes 1-22) as represented by the ‘TxDb.Hsapiens.UCSC.hg19.knownGene’ package?

# library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
exons <- exons(txdb) # extract exon info

autosome <- paste0("chr", c(1:22)) # autosomes = chr1:22
df <- data.frame(seqnames = autosome)
exons <- keepSeqlevels(exons, autosome) # use only autosomal exons
ncbiStyleLevels <- mapSeqlevels(seqlevels(exons),"NCBI")
exons <- renameSeqlevels(exons, ncbiStyleLevels)
subset <- subsetByOverlaps(airway, exons)

subset

## class: RangedSummarizedExperiment 
## dim: 26276 8 
## metadata(1): ''
## assays(1): counts
## rownames(26276): ENSG00000000419 ENSG00000000457 ...
##   ENSG00000273471 ENSG00000273477
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample

Q9: Consider sample ‘SRR1039508’, what probability (where: 0 < p < 1) of the total reads in the ‘airway’ dataset which are part of a feature that overlaps an autosomal transcript from ‘TxDb.Hsapiens.UCSC.hg19.knownGene’?

SRR1039508 <- airway[,1]
subset_SRR1039508 <- subsetByOverlaps(SRR1039508, exons)# use only autosomal exons
counts <- assay(SRR1039508, "counts")
subset_counts <- assay(subset_SRR1039508, "counts")

sum(subset_counts)/sum(counts) # probability (where: 0 < p < 1)

## [1] 0.9004193

Q10: What is the median number of counts per feature (for sample SRR1039508) which contain a H3K4me narrowPeak in their promoter (only features which overlap autosomal transcripts from ‘TxDb.Hsapiens.UCSC.hg19.knownGene’ should be considered)?

# library(AnnotationHub)
ah <- AnnotationHub()

## snapshotDate(): 2016-10-11

qah_h1 <- query(ah, c("E096", "H3K4me3"))
h1 <- qah_h1[["AH30596"]] # AH30596 | E096-H3K4me3.narrowPeak.gz

## require("rtracklayer")

## loading from cache '/home/mcc/.AnnotationHub/36036'

h1 <- keepSeqlevels(h1, autosome)
h1 <- renameSeqlevels(h1, ncbiStyleLevels)
t <- range(rowRanges(subset_SRR1039508))
ncbiByGroup <- extractSeqlevelsByGroup(species = "Homo sapiens", 
                                     style = "NCBI", 
                                     group = "auto")
t <- keepSeqlevels(t, ncbiByGroup)
p <- promoters(t)
ov <- subsetByOverlaps(p, h1)
t2 <- subsetByOverlaps(SRR1039508, ov)
counts <- assay(t2, "counts")

median(counts)

## [1] 205

sessionInfo()

## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rtracklayer_1.32.2                                
##  [2] AnnotationHub_2.4.2                               
##  [3] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2           
##  [4] GenomicFeatures_1.24.5                            
##  [5] minfiData_0.14.0                                  
##  [6] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.2.1
##  [7] IlluminaHumanMethylation450kmanifest_0.4.0        
##  [8] minfi_1.18.6                                      
##  [9] bumphunter_1.12.0                                 
## [10] locfit_1.5-9.1                                    
## [11] iterators_1.0.8                                   
## [12] foreach_1.4.3                                     
## [13] Biostrings_2.40.2                                 
## [14] XVector_0.12.1                                    
## [15] lattice_0.20-34                                   
## [16] dplyr_0.5.0                                       
## [17] biomaRt_2.28.0                                    
## [18] airway_0.106.2                                    
## [19] SummarizedExperiment_1.2.3                        
## [20] GenomicRanges_1.24.3                              
## [21] GenomeInfoDb_1.8.7                                
## [22] hgu95av2.db_3.2.3                                 
## [23] org.Hs.eg.db_3.3.0                                
## [24] AnnotationDbi_1.34.4                              
## [25] IRanges_2.6.1                                     
## [26] S4Vectors_0.10.3                                  
## [27] ALL_1.14.0                                        
## [28] GEOquery_2.38.4                                   
## [29] Biobase_2.32.0                                    
## [30] BiocGenerics_0.18.0                               
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.2.1                    nor1mix_1.2-2                
##  [3] splines_3.3.1                 shiny_0.14.1                 
##  [5] assertthat_0.1                interactiveDisplayBase_1.10.3
##  [7] doRNG_1.6                     Rsamtools_1.24.0             
##  [9] yaml_2.1.13                   RSQLite_1.0.0                
## [11] limma_3.28.21                 quadprog_1.5-5               
## [13] chron_2.3-47                  digest_0.6.10                
## [15] RColorBrewer_1.1-2            httpuv_1.3.3                 
## [17] htmltools_0.3.5               preprocessCore_1.34.0        
## [19] Matrix_1.2-7.1                plyr_1.8.4                   
## [21] siggenes_1.46.0               XML_3.98-1.4                 
## [23] genefilter_1.54.2             zlibbioc_1.18.0              
## [25] xtable_1.8-2                  BiocParallel_1.6.6           
## [27] tibble_1.2                    openssl_0.9.4                
## [29] annotate_1.50.1               beanplot_1.2                 
## [31] pkgmaker_0.22                 lazyeval_0.2.0               
## [33] mime_0.5                      survival_2.39-5              
## [35] magrittr_1.5                  mclust_5.2                   
## [37] evaluate_0.10                 nlme_3.1-128                 
## [39] MASS_7.3-45                   BiocInstaller_1.22.3         
## [41] data.table_1.9.6              tools_3.3.1                  
## [43] registry_0.3                  formatR_1.4                  
## [45] matrixStats_0.51.0            stringr_1.1.0                
## [47] rngtools_1.2.4                base64_2.0                   
## [49] grid_3.3.1                    RCurl_1.95-4.8               
## [51] bitops_1.0-6                  rmarkdown_1.0                
## [53] codetools_0.2-15              multtest_2.28.0              
## [55] curl_2.1                      DBI_0.5-1                    
## [57] reshape_0.8.5                 R6_2.2.0                     
## [59] illuminaio_0.14.0             GenomicAlignments_1.8.4      
## [61] knitr_1.14                    stringi_1.1.2                
## [63] Rcpp_0.12.7

Project 3: Expression Sets

10/15/16

load packages

Q1: What is the mean expression across all features for sample 5 in the ‘ALL’ dataset?

Q2: Using Ensembl, annotate each feature of the ‘ALL’ dataset with the Ensembl gene id. How many probesets (features) are annotated with more than one Ensembl gene id?

Q3: How many probesets (Affymetrix IDs) are annotated with one or more genes on the autosomes, chromosomes 1 to 22?

Q4: What is the mean value of the Methylation channel across the features for sample ‘5723646052_R04C01’?

Q5: Access the processed data from NCBI GEO Accession number GSE788. What is the mean expression level of sample ‘GSM9024’?

Q6: What is the mean length across the samples in the experiment?

Q7: What is the number of Ensembl genes which have a count of >= 1 read in sample ‘SRR1039512’?

Q8: The airway dataset contains more than 64k features. How many of these features overlaps with transcripts on the autosomes (chromosomes 1-22) as represented by the ‘TxDb.Hsapiens.UCSC.hg19.knownGene’ package?

Q9: Consider sample ‘SRR1039508’, what probability (where: 0 < p < 1) of the total reads in the ‘airway’ dataset which are part of a feature that overlaps an autosomal transcript from ‘TxDb.Hsapiens.UCSC.hg19.knownGene’?

Q10: What is the median number of counts per feature (for sample SRR1039508) which contain a H3K4me narrowPeak in their promoter (only features which overlap autosomal transcripts from ‘TxDb.Hsapiens.UCSC.hg19.knownGene’ should be considered)?