Create an ExperimentHub containing metadata for all curatedMetagenomicData
records:
suppressPackageStartupMessages(library(ExperimentHub))
eh = ExperimentHub()
## snapshotDate(): 2016-10-26
myquery = query(eh, "curatedMetagenomicData")
Subset this to bug abundance from stool datasets. Note that this could also be done using myquery$tags
(once the tags are updated). See the available metadata using View(mcols(myquery))
.
myquery.stool <- myquery[grepl("stool", myquery$title) & grepl("bugs", myquery$title), ]
myquery.stool
## ExperimentHub with 11 records
## # snapshotDate(): 2016-10-26
## # $dataprovider: Institute of Microbiology and Infection, University of...
## # $species: Homo Sapiens
## # $rdataclass: ExpressionSet
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["EH236"]]'
##
## title
## EH236 | HMP_2012.metaphlan_bugs_list.stool
## EH277 | KarlssonFH_2013.metaphlan_bugs_list.stool
## EH283 | LeChatelierE_2013.metaphlan_bugs_list.stool
## EH289 | LomanNJ_2013_Hi.metaphlan_bugs_list.stool
## EH295 | LomanNJ_2013_Mi.metaphlan_bugs_list.stool
## ... ...
## EH307 | Obregon-TitoAJ_2015.metaphlan_bugs_list.stool
## EH319 | QinJ_2012.metaphlan_bugs_list.stool
## EH325 | QinN_2014.metaphlan_bugs_list.stool
## EH331 | RampelliS_2015.metaphlan_bugs_list.stool
## EH361 | ZellerG_2014.metaphlan_bugs_list.stool
Create a list of ExpressionSet objects:
eset.list <- lapply(names(myquery.stool), function(x) myquery.stool[[x]])
Give them simplified titles:
names(eset.list) <- myquery.stool$title
names(eset.list) <- gsub("\\..+", "", myquery.stool$title)
And add the titles to the colnames:
for (i in 1:length(eset.list)){
colnames(eset.list[[i]]) <- paste(names(eset.list)[[i]],
colnames(eset.list[[i]]),
sep=".")
pData(eset.list[[i]]) <- pData(eset.list[[i]])[, !sapply(pData(eset.list[[i]]), function(x) all(is.na(x)))]
eset.list[[i]]$subjectID <- as.character(eset.list[[i]]$subjectID)
}
ExpressionSet
joinWithRnames <- function(obj, FUN = I) {
mylist <- lapply(obj, function(x) {
df <- data.frame(FUN(x))
df$rnames28591436107 <- rownames(df)
return(df)
})
bigdf <- Reduce(dplyr::full_join, mylist)
rownames(bigdf) <- make.names(bigdf$rnames28591436107)
bigdf <- bigdf[, !grepl("^rnames28591436107$", colnames(bigdf))]
return(bigdf)
}
pdat <- joinWithRnames(eset.list, FUN=pData)
pdat$study <- sub("\\..+", "", rownames(pdat))
ab <- joinWithRnames(eset.list, FUN=exprs)
ab[is.na(ab)] <- 0
eset <- ExpressionSet(assayData = as.matrix(ab),
phenoData = AnnotatedDataFrame(pdat))
eset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 3463 features, 1885 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: HMP_2012.SRS011061 HMP_2012.SRS011084 ...
## ZellerG_2014.CCIS98832363ST.4.0 (1885 total)
## varLabels: subjectID visit_number ... study (105 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
phyloseq
objectsuppressPackageStartupMessages(library(phyloseq))
source("https://raw.githubusercontent.com/waldronlab/presentations/master/Waldron_2016-06-07_EPIC/metaphlanToPhyloseq.R")
pseq <- metaphlanToPhyloseq(tax=exprs(eset), metadat=pData(eset), split=".")
ord = ordinate(pseq, method="PCoA", distance="bray")
plot_ordination(pseq, ord, color="disease") +
ggplot2::ggtitle("Bray-Curtis Principal Coordinates Analysis")
plot_ordination(pseq, ord, color="country") +
ggplot2::ggtitle("Bray-Curtis Principal Coordinates Analysis")
plot_ordination(pseq, ord, color="study") +
ggplot2::ggtitle("Bray-Curtis Principal Coordinates Analysis")
pseq.spstrain = subset_taxa(pseq, !is.na(Species)) #species + strain only
ord2 = ordinate(pseq.spstrain, method="PCoA", distance="bray")
plot_ordination(pseq.spstrain, ord, color="study") +
ggplot2::ggtitle("Bray-Curtis Principal Coordinates Analysis")
for (i in 1:length(pdat)){
cat(names(pdat)[[i]])
cat("\n")
if(is(pdat[[i]], "numeric")){
print(summary(pdat[[i]]))
}else{
if(length(unique(pdat[[i]])) <= 30)
print(table(pdat[[i]]))
}
cat("\n")
}
## subjectID
##
## visit_number
##
## 1 2 3
## 86 57 4
##
## bodysite
##
## stool
## 1885
##
## snprnt
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 700013715 700024542 700038354 700062686 700103016 700119496 1738
##
## gender
##
## female male
## 640 735
##
## wmsphase
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 1 1 1 1 1 1738
##
## disease
##
## cancer cirrhosis
## 53 123
## ibd_crohn_disease ibd_ulcerative_colitis
## 21 127
## impaired_glucose_tolerance large_adenoma
## 49 15
## leaness n
## 96 840
## n_relative obese
## 47 5
## obesity overweight
## 169 10
## small_adenoma stec2-positive
## 27 53
## t2d underweight
## 223 1
##
## country
##
## china denmark estonia finland france germany
## 600 471 1 1 157 57
## hungary iceland italy norway peru slovakia
## 1 1 11 1 36 1
## spain sweden tanzania usa yugoslavia
## 219 130 27 169 2
##
## age
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 35.00 49.00 47.79 62.00 89.00 367
##
## number_reads
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 164938 33202947 49691540 53972539 66357610 238641707
##
## classification
##
## igt ngt t2d
## 49 43 53
##
## gad.antibodies
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.2000 0.4000 0.9621 0.8000 56.6000 1740
##
## bmi
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 14.05 21.62 24.45 25.57 29.00 46.60 319
##
## whr
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.7300 0.8200 0.8600 0.8597 0.9000 1.0500 1740
##
## wc
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 66.0 82.0 90.0 89.4 95.0 117.0 1740
##
## cholesterol
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.420 4.730 5.410 5.451 6.100 8.710 1740
##
## triglycerides
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.420 0.840 1.040 1.301 1.500 6.350 1740
##
## hdl
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.300 1.030 1.245 1.383 1.583 6.790 1409
##
## ldl
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.140 2.640 3.210 3.229 3.830 7.120 1405
##
## creatinine
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 44.00 60.00 68.00 69.79 74.00 182.00 1740
##
## y.gt
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.1400 0.2600 0.3600 0.5374 0.5700 4.6000 1740
##
## fasting_glucose
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.900 5.300 5.800 6.383 6.825 16.500 1741
##
## fasting_insulin
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.8 6.2 8.8 11.8 13.0 66.0 1740
##
## hba1c
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 26.00 36.00 38.00 40.64 43.00 77.00 1740
##
## adiponectin
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.680 9.595 11.900 13.361 15.725 34.800 1741
##
## leptin
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.17 12.57 21.65 24.94 33.50 105.00 1741
##
## glp.1
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.300 0.900 1.774 1.700 17.800 1755
##
## fgf.19
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 16.40 71.64 107.83 138.50 160.03 782.18 1755
##
## hscrp
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.130 0.940 1.530 4.117 3.480 99.220 1740
##
## c.peptide
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.1200 0.6300 0.8100 0.9062 1.0700 2.8000 1740
##
## tnfa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.750 1.560 2.030 2.162 2.530 7.100 1740
##
## il.1
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.9818 0.0000 16.8500 1755
##
## cd163
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 109.8 610.9 764.9 786.4 902.2 2276.5 1755
##
## statins
##
## n y
## 93 52
##
## insulin
##
## n y
## 139 6
##
## oral_anti.diabetic_medication
##
## met sulph
## 20 2
##
## years_in_sweden
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.0 34.5 38.0 38.2 40.0 60.0 1870
##
## ethnicity
##
## asian white
## 600 1011
##
## paired_end_insert_size
##
## read_length
##
## 44 44/75 75 90
## 14 1 162 115
##
## matched_reads
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 14251095 30895897 39132640 42448127 53507661 122587818 1593
##
## uniquely_matching_reads
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13112580 26606088 33623355 36814287 46519632 108713556 1593
##
## uniquely_matched_reads
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 7044202 15308787 20383454 21808160 27785946 60517895 1593
##
## gene_number
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 91032 560710 686014 670528 784221 1005488 1593
##
## gene_number_for_11M_uniquely_matched_reads
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 59147 493405 599938 578512 679028 878816 1593
##
## hitchip_probe_number
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 524 1166 1264 1255 1372 1650 1593
##
## gene_count_class
##
## hgc lgc
## 224 68
##
## hitchip_probe_class
##
## hpc lpc
## 227 65
##
## first
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1122 2723 2934 3122 3878 5066 1833
##
## repeat.
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.6346 0.0000 5.0000 1833
##
## stooltexture
##
## bloody smooth watery
## 8 17 11
##
## daysafteronset
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 3.00 5.50 6.71 8.00 26.00 1847
##
## hus
##
## n y
## 26 12
##
## stec_count
##
## high low moderate
## 17 16 5
##
## shigatoxin2elisa
##
## negative positive
## 12 26
##
## readsmillions
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 7.50 10.97 14.35 22.98 19.57 110.00 1833
##
## nonhuman
##
## >99 0 1.86 15 19 21 29 31 32 33 33.04 35
## 24 1 1 2 2 1 1 1 1 1 1 1
## 36 40 46 6 67.51 69 78 9 90 99
## 1 1 1 1 1 1 1 1 1 6
##
## stec_coverage
##
## <1 1 10 11 16 17 19 2 22 29 3 39 4 5 6 619 7 8
## 11 2 1 2 1 1 1 4 1 1 3 1 3 1 1 1 1 1
## 9
## 1
##
## stxab_detected
##
## n y
## 19 29
##
## stx_ratio
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.400 1.000 1.000 2.793 2.500 13.000 1870
##
## typingdata
##
## n y
## 14 24
##
## c_difficile_frequency
##
## 0.0006 0.0009 0.0014 0.0036 0.0037 0.004 0.0041 0.0069
## 1 1 2 1 1 1 1 1
## negative positive
## 40 3
##
## sampling_day
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 407.5 505.0 482.9 587.0 954.0 1666
##
## dfmp
##
## dfmp
## 19
##
## mgs_richness
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 33.0 124.0 156.0 154.6 192.0 307.0 1492
##
## mgs_profile_matched_sample_pairs
##
## population
##
## matses norman tunapuco
## 24 22 12
##
## bmi_class
##
## healthy obese overweight underweight
## 35 5 10 1
##
## X16s_rrna
##
## y
## 58
##
## shotgun_metagenome
##
## y
## 58
##
## stage
##
## discovery stage_I stage_II validation
## 181 145 199 56
##
## height
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 147 158 165 164 170 186 1541
##
## weight
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 36.00 55.00 62.00 62.92 70.00 121.50 1541
##
## diabetic
##
## n y
## 174 170
##
## fbg
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.130 5.050 6.215 7.159 8.785 33.000 1541
##
## sbp
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 84.0 112.5 124.5 125.1 135.0 210.0 1559
##
## dbp
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 45.00 70.00 78.00 76.53 80.00 121.00 1559
##
## fins
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.600 7.827 12.395 13.047 16.883 34.600 1689
##
## fcp
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.040 1.100 1.420 1.684 2.360 4.540 1840
##
## hbalc
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4.200 5.400 5.850 6.814 7.500 13.800 1555
##
## tg
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.420 1.060 1.480 1.707 2.080 8.710 1549
##
## tcho
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.030 4.497 5.040 5.073 5.725 9.110 1549
##
## cirrhotic
##
## n y
## 114 123
##
## hbv_related
##
## n y
## 138 99
##
## alcohol_related
##
## n y
## 203 34
##
## other_causes_related
##
## autoimmune_related
## 1
## drug_related
## 1
## hepatitis_c_virus_related
## 1
## hepatitis_d_virus_related
## 5
## hepatitis_e_virus_related
## 8
## hepatolenticular_degeneration_related
## 2
## n
## 214
## primary_biliary_cirrhosis_&_autoimmune_related
## 1
## schistosoma_related
## 2
## schistosoma?hepatitis_e_virus_and_autoimmune_related
## 1
## schistosoma?hepatitis_e_virus_related
## 1
##
## inr
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.920 1.145 1.280 1.351 1.465 2.350 1762
##
## crea
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 30.00 53.00 63.00 65.01 76.00 163.00 1648
##
## alb
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.20 33.50 43.50 40.62 48.70 57.60 1648
##
## tb
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.00 12.00 16.00 41.12 30.00 597.00 1648
##
## pt
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 11.00 13.45 14.70 15.63 17.05 27.00 1762
##
## ascites
##
## absent mild sever
## 57 37 29
##
## he
##
## grade_1 grade_4 none
## 2 1 120
##
## ctp
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.000 6.000 7.000 7.691 9.000 14.000 1762
##
## meld
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -6.10 4.15 7.40 8.30 11.95 23.40 1762
##
## antivirus
##
## adefovir entecavir
## 2 11
## lamivudine lamivudine_adefovir
## 9 27
## lamivudine_foscarnet_sodium none
## 1 71
## telbivudine
## 2
##
## beta.blocker
##
## none propranolol
## 111 12
##
## camp
##
## bologna dedauko sengele
## 11 20 7
##
## tnm_stage
##
## t1n0m0 t2n0m0 t2n1m0 t2n1m1 t3n0m0 t3n0m1 t3n1m0 t3n1m1 t3nxm1 t4n0m0
## 9 6 1 1 6 1 7 9 1 1
## t4n0m1 t4n1m0 t4n1m1
## 1 2 8
##
## ajcc_stage
##
## i ii iii iv
## 15 7 10 21
##
## localization
##
## lc lc/rc rc rectum sigma
## 35 6 29 16 9
##
## fobt
##
## negative positive
## 121 34
##
## wif.1_gene_methylation_test
##
## negative positive
## 125 27
##
## group
##
## control crc
## 88 53
##
## study
##
## HMP_2012 KarlssonFH_2013 LeChatelierE_2013 LomanNJ_2013_Hi
## 147 145 292 44
## LomanNJ_2013_Mi NielsenHB_2014 Obregon QinJ_2012
## 9 396 58 363
## QinN_2014 RampelliS_2015 ZellerG_2014
## 237 38 156
sessionInfo()
## R Under development (unstable) (2016-10-26 r71594)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X Yosemite 10.10.5
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] curatedMetagenomicData_1.0.0 phyloseq_1.19.0
## [3] magrittr_1.5 ExperimentHubData_1.1.0
## [5] AnnotationHubData_1.5.0 futile.logger_1.4.3
## [7] GenomicRanges_1.27.2 GenomeInfoDb_1.11.0
## [9] IRanges_2.9.0 S4Vectors_0.13.1
## [11] Biobase_2.35.0 ExperimentHub_1.1.0
## [13] AnnotationHub_2.7.0 BiocGenerics_0.21.0
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-128 bitops_1.0-6
## [3] httr_1.2.1 tools_3.4.0
## [5] R6_2.2.0 vegan_2.4-1
## [7] mgcv_1.8-15 DBI_0.5-1
## [9] colorspace_1.2-7 permute_0.9-4
## [11] ade4_1.7-4 curl_2.2
## [13] chron_2.3-47 graph_1.53.0
## [15] BiocCheck_1.11.0 formatR_1.4
## [17] xml2_1.0.0 labeling_0.3
## [19] rtracklayer_1.35.0 scales_0.4.0
## [21] RBGL_1.51.0 stringr_1.1.0
## [23] digest_0.6.10 Rsamtools_1.27.2
## [25] rmarkdown_1.1 GEOquery_2.41.0
## [27] AnnotationForge_1.17.1 XVector_0.15.0
## [29] rBiopaxParser_2.15.0 htmltools_0.3.5
## [31] RSQLite_1.0.0 BiocInstaller_1.25.2
## [33] shiny_0.14.1 jsonlite_1.1
## [35] BiocParallel_1.9.0 RCurl_1.95-4.8
## [37] biomformat_1.3.0 Matrix_1.2-7.1
## [39] Rcpp_0.12.7 munsell_0.4.3
## [41] ape_3.5 stringi_1.1.2
## [43] yaml_2.1.13 MASS_7.3-45
## [45] SummarizedExperiment_1.5.1 zlibbioc_1.21.0
## [47] rhdf5_2.19.0 biocViews_1.43.1
## [49] plyr_1.8.4 grid_3.4.0
## [51] lattice_0.20-34 Biostrings_2.43.0
## [53] splines_3.4.0 multtest_2.31.0
## [55] GenomicFeatures_1.27.0 knitr_1.14
## [57] igraph_1.0.1 optparse_1.3.2
## [59] RUnit_0.4.31 reshape2_1.4.2
## [61] codetools_0.2-15 biomaRt_2.31.1
## [63] futile.options_1.0.0 XML_3.98-1.4
## [65] evaluate_0.10 lambda.r_1.1.9
## [67] data.table_1.9.6 httpuv_1.3.3
## [69] foreach_1.4.3 gtable_0.2.0
## [71] getopt_1.20.0 assertthat_0.1
## [73] ggplot2_2.1.0 mime_0.5
## [75] xtable_1.8-2 survival_2.39-5
## [77] tibble_1.2 OrganismDbi_1.17.1
## [79] iterators_1.0.8 GenomicAlignments_1.11.0
## [81] AnnotationDbi_1.37.0 cluster_2.0.5
## [83] interactiveDisplayBase_1.13.0