This is a combination of testing Methylkit as well as R Notebook, there may be some rough spots.
We begin by loading some required packages, Rcpp may not be required, but methylkit documentation suggests it, so it got loaded. Also sets our working directory, which is both where our output will be directed towards, as well as the location of our data files.
invisible(require("Rcpp"))
invisible(source("http://bioconductor.org/biocLite.R"))
Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
invisible(biocLite(c("GenomeRanges", "IRanges")))
BioC_mirror: https://bioconductor.org
Using Bioconductor 3.4 (BiocInstaller 1.24.0), R 3.3.1 (2016-06-21).
Installing package(s) ‘GenomeRanges’, ‘IRanges’
package ‘GenomeRanges’ is not available (for R version 3.3.1)trying URL 'https://bioconductor.org/packages/3.4/bioc/src/contrib/IRanges_2.8.1.tar.gz'
Content type 'application/x-gzip' length 477107 bytes (465 KB)
==================================================
downloaded 465 KB
* installing *source* package ‘IRanges’ ...
** libs
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c CompressedAtomicList_utils.c -o CompressedAtomicList_utils.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c CompressedIRangesList_class.c -o CompressedIRangesList_class.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c CompressedList_class.c -o CompressedList_class.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c Grouping_class.c -o Grouping_class.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c IRanges_class.c -o IRanges_class.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c IRanges_constructor.c -o IRanges_constructor.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c NCList.c -o NCList.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c R_init_IRanges.c -o R_init_IRanges.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c Ranges_class.c -o Ranges_class.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c Ranges_comparison.c -o Ranges_comparison.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c RleViews_utils.c -o RleViews_utils.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c S4Vectors_stubs.c -o S4Vectors_stubs.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c SimpleRangesList_class.c -o SimpleRangesList_class.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c coverage_methods.c -o coverage_methods.o
gcc -std=gnu99 -I/usr/share/R/include -DNDEBUG -I"/home/sean/R/x86_64-pc-linux-gnu-library/3.3/S4Vectors/include" -fpic -g -O2 -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -g -c inter_range_methods.c -o inter_range_methods.o
gcc -std=gnu99 -shared -L/usr/lib/R/lib -Wl,-Bsymbolic-functions -Wl,-z,relro -o IRanges.so CompressedAtomicList_utils.o CompressedIRangesList_class.o CompressedList_class.o Grouping_class.o IRanges_class.o IRanges_constructor.o NCList.o R_init_IRanges.o Ranges_class.o Ranges_comparison.o RleViews_utils.o S4Vectors_stubs.o SimpleRangesList_class.o coverage_methods.o inter_range_methods.o -L/usr/lib/R/lib -lR
installing to /home/sean/R/x86_64-pc-linux-gnu-library/3.3/IRanges/libs
** R
** inst
** preparing package for lazy loading
Warning: package ‘S4Vectors’ was built under R version 3.3.2
Creating a generic function for ‘window<-’ from package ‘stats’ in package ‘IRanges’
Creating a generic function for ‘rev’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘stack’ from package ‘utils’ in package ‘IRanges’
Creating a generic function for ‘mean’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘split<-’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘drop’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘ifelse’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘diff’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘median’ from package ‘stats’ in package ‘IRanges’
Creating a generic function for ‘quantile’ from package ‘stats’ in package ‘IRanges’
Creating a generic function for ‘smoothEnds’ from package ‘stats’ in package ‘IRanges’
Creating a generic function for ‘runmed’ from package ‘stats’ in package ‘IRanges’
Creating a generic function for ‘chartr’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘tolower’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘toupper’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘sub’ from package ‘base’ in package ‘IRanges’
Creating a generic function for ‘gsub’ from package ‘base’ in package ‘IRanges’
** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded
Warning: package ‘S4Vectors’ was built under R version 3.3.2
* DONE (IRanges)
The downloaded source packages are in
‘/tmp/RtmpD1nHGE/downloaded_packages’
invisible(require("devtools"))
invisible(install_github("al2na/methylKit", build_vignettes = FALSE, repos = BiocInstaller::biocinstallRepos(), dependencies = TRUE))
Skipping install of 'methylKit' from a github remote, the SHA1 (50f71b8e) has not changed since last install.
Use `force = TRUE` to force installation
invisible(setwd("/home/sean/Documents/Bismark/BismarkData/BismarkOutput/cov_working"))
Next we set up some variables for methylkit. olyLables are the individual sample names, file_labels removes the .fastq.gz and appends the appropriate bismark generated text for coverage files. Sample IDs denote the 8 different samples, and treatment is a factor for treatment.
olyLabels <- c("1_ATCACG_L001_R1_001.fastq.gz", "2_CGATGT_L001_R1_001.fastq.gz",
"3_TTAGGC_L001_R1_001.fastq.gz", "4_TGACCA_L001_R1_001.fastq.gz",
"5_ACAGTG_L001_R1_001.fastq.gz", "6_GCCAAT_L001_R1_001.fastq.gz",
"7_CAGATC_L001_R1_001.fastq.gz", "8_ACTTGA_L001_R1_001.fastq.gz")
file_labels <- as.list(paste0(gsub(".fastq.gz", "", olyLabels), "_bismark_bt2.bismark.cov"))
sample_id <- list("1", "2", "3", "4", "5", "6", "7", "8")
treatment <- c(0,0,0,0,1,1,1,1)
Next blob runs initial Methylkit data importation step, creating a methylrawlistDB object. The mincov argument sets minimum coverage level required, 1 includes all data. Next filterByCoverage filters based on high and low read cutoffs, high to eliminate PCR bias, and low to increase statistical efficacy. Picked 3 based on talking to Hollie. The dim commands show how many samples are removed via filtering.
testObj.1 <- methRead( file_labels, sample_id, assembly = "test", mincov = 1, pipeline = 'bismarkCoverage', context = "CpG", treatment = treatment)
dim(testObj.1[[1]])
[1] 865501 7
filtered.testObj.1.3 <- filterByCoverage(testObj.1, lo.count = 3, low.perc = NULL, hi.count = NULL, high.perc = 95)
dim(filtered.testObj.1.3[[1]])
[1] 116410 7
Plots a histogram of coverage by base for each sample. Uncomment lines to get output
getMethylationStats(filtered.testObj.1.3[[1]], plot = T, both.strands = F)
getMethylationStats(filtered.testObj.1.3[[2]], plot = T, both.strands = F)
getMethylationStats(filtered.testObj.1.3[[3]], plot = T, both.strands = F)
getMethylationStats(filtered.testObj.1.3[[4]], plot = T, both.strands = F)
getMethylationStats(filtered.testObj.1.3[[5]], plot = T, both.strands = F)
getMethylationStats(filtered.testObj.1.3[[6]], plot = T, both.strands = F)
getMethylationStats(filtered.testObj.1.3[[7]], plot = T, both.strands = F)
getMethylationStats(filtered.testObj.1.3[[8]], plot = T, both.strands = F)
Plots coverage histogram
getCoverageStats(filtered.testObj.1.3[[1]], plot = T, both.strands = F)
getCoverageStats(filtered.testObj.1.3[[2]], plot = T, both.strands = F)
getCoverageStats(filtered.testObj.1.3[[3]], plot = T, both.strands = F)
getCoverageStats(filtered.testObj.1.3[[4]], plot = T, both.strands = F)
getCoverageStats(filtered.testObj.1.3[[5]], plot = T, both.strands = F)
getCoverageStats(filtered.testObj.1.3[[6]], plot = T, both.strands = F)
getCoverageStats(filtered.testObj.1.3[[7]], plot = T, both.strands = F)
getCoverageStats(filtered.testObj.1.3[[8]], plot = T, both.strands = F)
Merge samples into single object.
meth <- unite(filtered.testObj.1.3, destrand = F)
dim(meth)
[1] 29970 28
getCorrelation(meth, plot = F)
1 2 3 4 5 6 7 8
1 1.0000000 0.8157266 0.8220120 0.8192927 0.8163059 0.8126666 0.8099266 0.8134888
2 0.8157266 1.0000000 0.8230040 0.8272867 0.8193768 0.8195559 0.8166163 0.8255220
3 0.8220120 0.8230040 1.0000000 0.8306129 0.8379462 0.8245320 0.8247905 0.8275256
4 0.8192927 0.8272867 0.8306129 1.0000000 0.8220671 0.8229429 0.8170476 0.8211265
5 0.8163059 0.8193768 0.8379462 0.8220671 1.0000000 0.8237930 0.8154873 0.8292483
6 0.8126666 0.8195559 0.8245320 0.8229429 0.8237930 1.0000000 0.8181590 0.8291018
7 0.8099266 0.8166163 0.8247905 0.8170476 0.8154873 0.8181590 1.0000000 0.8221480
8 0.8134888 0.8255220 0.8275256 0.8211265 0.8292483 0.8291018 0.8221480 1.0000000
Plots clustergram and PCA, uncomment lines to output to file in working directory
par(mfrow = c(1,1))
#pdf(file = "clustergram.pdf")
CpGDendro <- clusterSamples(meth, dist = "correlation", method = "ward", plot = T)
The "ward" method has been renamed to "ward.D"; note new "ward.D2"
#dev.off()
#pdf(file = "PCA.pdf")
PCA <- PCASamples(meth, scale = T, center = T)
#dev.off()
#pdf(filename = "screeplot.pdf")
PCASamples(meth, screeplot = T)
#dev.off()