# source("https://bioconductor.org/biocLite.R")
# biocLite("ALL")
# browseVignettes("ALL")
# source("https://bioconductor.org/biocLite.R")
# biocLite("hgu95av2.db")
# install.packages(dplyr)
# source("https://bioconductor.org/biocLite.R")
# biocLite("airway")
# source("https://bioconductor.org/biocLite.R")
# biocLite("mixOmics")
# biocLite("doing")
# biocLite("minfiData")
# biocLite("biomaRt")
# biocLite("AnnotationHub")TxDb.Hsapiens.UCSC.hg19.knownGene
# biocLite("TxDb.Hsapiens.UCSC.hg19.knownGene")
# This little gem updates Bioconductor ONLY if R is also up to date.
# source("http://bioconductor.org/biocLite.R")
# biocLite()
packages <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE,
repos='http://cran.rstudio.com/')
sapply(pkg, require, character.only = TRUE)
}
packages(c("GEOquery", "Biobase", "ALL", "hgu95av2.db", "GenomicRanges", "airway", "biomaRt", "dplyr", "minfiData", "TxDb.Hsapiens.UCSC.hg19.knownGene", "AnnotationHub"))
## Loading required package: GEOquery
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
## Loading required package: ALL
## Loading required package: hgu95av2.db
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: org.Hs.eg.db
##
##
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: airway
## Loading required package: SummarizedExperiment
## Loading required package: biomaRt
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:biomaRt':
##
## select
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, regroup, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: minfiData
## Loading required package: minfi
## Loading required package: lattice
## Loading required package: Biostrings
## Loading required package: XVector
## Loading required package: bumphunter
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: locfit
## locfit 1.5-9.1 2013-03-22
## Loading required package: IlluminaHumanMethylation450kmanifest
## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
## Loading required package: GenomicFeatures
## Loading required package: AnnotationHub
##
## Attaching package: 'AnnotationHub'
## The following object is masked from 'package:dplyr':
##
## query
## The following object is masked from 'package:Biobase':
##
## cache
## GEOquery Biobase
## TRUE TRUE
## ALL hgu95av2.db
## TRUE TRUE
## GenomicRanges airway
## TRUE TRUE
## biomaRt dplyr
## TRUE TRUE
## minfiData TxDb.Hsapiens.UCSC.hg19.knownGene
## TRUE TRUE
## AnnotationHub
## TRUE
data(ALL)
sample5 <- ALL[,5]
# find mean expression for sample5
mean(exprs(sample5))
## [1] 5.629627
# connect to Ensembl
mart <- useMart(host = 'feb2014.archive.ensembl.org',
biomart = "ENSEMBL_MART_ENSEMBL")
ensembl <- useDataset("hsapiens_gene_ensembl", mart)
names <- featureNames(ALL)
# find affymatrix attributes
attrs <- listAttributes(ensembl, page = "feature_page")
# return results
result <- getBM(attributes = c("affy_hg_u95av2",
"ensembl_gene_id",
"chromosome_name"),
filters = "affy_hg_u95av2",
values = names,
mart = ensembl)
tail(result)
## affy_hg_u95av2 ensembl_gene_id chromosome_name
## 13419 34906_g_at ENSG00000272706 HG1350_HG959_PATCH
## 13420 32208_at ENSG00000262924 HSCHR19_1_CTG3_1
## 13421 41220_at ENSG00000261843 HSCHR17_3_CTG4
## 13422 1436_at ENSG00000271903 HSCHR22_2_CTG1
## 13423 35778_at ENSG00000262907 HSCHR2_2_CTG12
## 13424 39086_g_at ENSG00000262771 HSCHR7_1_CTG6
# group result by affy_hg_u95av2
prob_set <- result %>%
group_by (affy_hg_u95av2) %>%
summarise(prob_count = n())
sum(prob_set$prob_count > 1)
## [1] 1045
result_autosome <- subset(result, chromosome_name < 23)
prob_set_autosome <- result_autosome %>%
group_by (affy_hg_u95av2) %>%
summarise(prob_count = n())
#tail(result_autosome)
sum(prob_set_autosome$prob_count > 0)
## [1] 7494
data(MsetEx)
#pData(MsetEx) # 5723646052_R04C01
sample_2 <- MsetEx[,2] # return a MethylSet for sample #2
mean(getMeth(sample_2)) # produce mean of MethylSet for sample #2
## [1] 7228.277
# library(GEOquery)
query <- getGEO("GSE788")
## ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSEnnn/GSE788/matrix/
## Found 1 file(s)
## GSE788_series_matrix.txt.gz
## File stored at:
## /tmp/Rtmpf7pu3s/GPL85.soft
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : not all columns named in 'colClasses' exist
data <- query[[1]]
#pData(data)
GSM9024 <- data[,2]
mean(exprs(GSM9024))
## [1] 756.432
# library(airway)
data(airway)
mean(airway$avgLength)
## [1] 113.75
SRR1039512 <- airway[,3]
counts <- assay(SRR1039512, "counts")
sum(counts >= 1) # summation(subset counts which are greater than or equal to 1)
## [1] 25699
# library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
exons <- exons(txdb) # extract exon info
autosome <- paste0("chr", c(1:22)) # autosomes = chr1:22
df <- data.frame(seqnames = autosome)
exons <- keepSeqlevels(exons, autosome) # use only autosomal exons
ncbiStyleLevels <- mapSeqlevels(seqlevels(exons),"NCBI")
exons <- renameSeqlevels(exons, ncbiStyleLevels)
subset <- subsetByOverlaps(airway, exons)
subset
## class: RangedSummarizedExperiment
## dim: 26276 8
## metadata(1): ''
## assays(1): counts
## rownames(26276): ENSG00000000419 ENSG00000000457 ...
## ENSG00000273471 ENSG00000273477
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
SRR1039508 <- airway[,1]
subset_SRR1039508 <- subsetByOverlaps(SRR1039508, exons)# use only autosomal exons
counts <- assay(SRR1039508, "counts")
subset_counts <- assay(subset_SRR1039508, "counts")
sum(subset_counts)/sum(counts) # probability (where: 0 < p < 1)
## [1] 0.9004193
# library(AnnotationHub)
ah <- AnnotationHub()
## snapshotDate(): 2016-10-11
qah_h1 <- query(ah, c("E096", "H3K4me3"))
h1 <- qah_h1[["AH30596"]] # AH30596 | E096-H3K4me3.narrowPeak.gz
## require("rtracklayer")
## loading from cache '/home/mcc/.AnnotationHub/36036'
h1 <- keepSeqlevels(h1, autosome)
h1 <- renameSeqlevels(h1, ncbiStyleLevels)
t <- range(rowRanges(subset_SRR1039508))
ncbiByGroup <- extractSeqlevelsByGroup(species = "Homo sapiens",
style = "NCBI",
group = "auto")
t <- keepSeqlevels(t, ncbiByGroup)
p <- promoters(t)
ov <- subsetByOverlaps(p, h1)
t2 <- subsetByOverlaps(SRR1039508, ov)
counts <- assay(t2, "counts")
median(counts)
## [1] 205
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] rtracklayer_1.32.2
## [2] AnnotationHub_2.4.2
## [3] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [4] GenomicFeatures_1.24.5
## [5] minfiData_0.14.0
## [6] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.2.1
## [7] IlluminaHumanMethylation450kmanifest_0.4.0
## [8] minfi_1.18.6
## [9] bumphunter_1.12.0
## [10] locfit_1.5-9.1
## [11] iterators_1.0.8
## [12] foreach_1.4.3
## [13] Biostrings_2.40.2
## [14] XVector_0.12.1
## [15] lattice_0.20-34
## [16] dplyr_0.5.0
## [17] biomaRt_2.28.0
## [18] airway_0.106.2
## [19] SummarizedExperiment_1.2.3
## [20] GenomicRanges_1.24.3
## [21] GenomeInfoDb_1.8.7
## [22] hgu95av2.db_3.2.3
## [23] org.Hs.eg.db_3.3.0
## [24] AnnotationDbi_1.34.4
## [25] IRanges_2.6.1
## [26] S4Vectors_0.10.3
## [27] ALL_1.14.0
## [28] GEOquery_2.38.4
## [29] Biobase_2.32.0
## [30] BiocGenerics_0.18.0
##
## loaded via a namespace (and not attached):
## [1] httr_1.2.1 nor1mix_1.2-2
## [3] splines_3.3.1 shiny_0.14.1
## [5] assertthat_0.1 interactiveDisplayBase_1.10.3
## [7] doRNG_1.6 Rsamtools_1.24.0
## [9] yaml_2.1.13 RSQLite_1.0.0
## [11] limma_3.28.21 quadprog_1.5-5
## [13] chron_2.3-47 digest_0.6.10
## [15] RColorBrewer_1.1-2 httpuv_1.3.3
## [17] htmltools_0.3.5 preprocessCore_1.34.0
## [19] Matrix_1.2-7.1 plyr_1.8.4
## [21] siggenes_1.46.0 XML_3.98-1.4
## [23] genefilter_1.54.2 zlibbioc_1.18.0
## [25] xtable_1.8-2 BiocParallel_1.6.6
## [27] tibble_1.2 openssl_0.9.4
## [29] annotate_1.50.1 beanplot_1.2
## [31] pkgmaker_0.22 lazyeval_0.2.0
## [33] mime_0.5 survival_2.39-5
## [35] magrittr_1.5 mclust_5.2
## [37] evaluate_0.10 nlme_3.1-128
## [39] MASS_7.3-45 BiocInstaller_1.22.3
## [41] data.table_1.9.6 tools_3.3.1
## [43] registry_0.3 formatR_1.4
## [45] matrixStats_0.51.0 stringr_1.1.0
## [47] rngtools_1.2.4 base64_2.0
## [49] grid_3.3.1 RCurl_1.95-4.8
## [51] bitops_1.0-6 rmarkdown_1.0
## [53] codetools_0.2-15 multtest_2.28.0
## [55] curl_2.1 DBI_0.5-1
## [57] reshape_0.8.5 R6_2.2.0
## [59] illuminaio_0.14.0 GenomicAlignments_1.8.4
## [61] knitr_1.14 stringi_1.1.2
## [63] Rcpp_0.12.7