directory: /gpfs/alpine/syb105/proj-shared/Data/NCBI-Mirror/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/ gtf file: GCF_000001405.39_GRCh38.p13_genomic.gtf fasta file: GCF_000001405.39_GRCh38.p13_genomic.fna
# copy all file names in trimmed directory and print to text file --> 358 files (179 R1/R2 pairs)
ls /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/ATACseq_trimmed >> /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/trimmed.files.txt
# generate key file with path, R1, R2
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/atac.key.txt
# bwa mem
# git clone https://github.com/lh3/bwa.git
# bwa index ref.fa
# salloc -A SYB105 -p gpu -N 1 -t 1:00:00
# Generate one shell command per sample for a single step of the ATAC-seq
# pipeline. Each line of the key file holds: <data dir> <R1 name> <R2 name>.
#
# Usage: python <script>.py [step] > commands/<step>.commands.txt
# where step is one of STEPS (defaults to DEFAULT_STEP).
import sys

filepath = '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/atac.key.txt'

# Step emitted when none is given on the command line; edit per script copy.
DEFAULT_STEP = "bwa"
STEPS = ("fasta", "bwa", "sort", "index1", "rmdups", "index", "unique",
         "index2", "bamtobed", "macs2", "macs", "bamtobigwig")

GENOME = "2862010578"   # effective genome size passed to macs2 -g / bamCoverage
NAME = "opioid.atac"    # macs2 -n experiment name
EXTSIZE = "147"         # macs2 --extsize

BWA = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/bwa/bwa"
REF = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.fna"
PICARD_JAR = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/picard.jar"
BEDTOOLS = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/bin/bedtools"


def build_commands(d, r1, r2):
    """Return a dict mapping each pipeline step name to its shell command.

    d  -- data directory prefix (used as-is, so it must end with '/')
    r1 -- read-1 sample name; also used as the stem of every output file
    r2 -- read-2 sample name

    Every intermediate file lives in exactly one directory so that each step
    reads what the previous step wrote:
      bwa.output/sam/  bwa.output/bam/  bwa.output/rmdups.bam/
      bwa.output/rmdups.uniq.bam/  bwa.output/bed/
    """
    fastq1 = d + "ATACseq_trimmed/trimmed_" + r1 + ".fastq.gz"
    fastq2 = d + "ATACseq_trimmed/trimmed_" + r2 + ".fastq.gz"
    sam = "bwa.output/sam/" + r1 + ".bwa.sam"
    sorted_bam = "bwa.output/bam/" + r1 + ".sorted.bam"
    rmdups_bam = "bwa.output/rmdups.bam/" + r1 + ".rmdups.bam"
    uniq_bam = "bwa.output/rmdups.uniq.bam/" + r1 + ".rmdups.uniq.bam"
    uniq_bed = "bwa.output/bed/" + r1 + ".rmdups.uniq.bed"
    macs_tail = " -n " + NAME + " --keep-dup all --nomodel --extsize " + EXTSIZE

    return {
        # Reference indexing (sample independent).
        # NOTE(review): the trailing positional argument after the FASTA does
        # not look like a `bwa index` option -- confirm it is intended.
        "fasta": BWA + " index -a bwtsw " + REF
                 + " GCF_000001405.39_GRCh38.p13_genomic",
        "bwa": BWA + " mem -t 4 " + REF
               + " '<zcat " + fastq1 + "' '<zcat " + fastq2 + "' > " + sam,
        "sort": "samtools sort -@ 4 -O bam -T bwa.output/bam/" + r1 + ".tmp -o "
                + sorted_bam + " " + sam,
        "index1": "samtools index " + sorted_bam,
        "rmdups": "java -Xms512m -Xmx16g -jar " + PICARD_JAR
                  + " MarkDuplicates -I " + sorted_bam
                  + " -M rmdups.output/" + r1 + "_report.txt -O " + rmdups_bam
                  + " --VALIDATION_STRINGENCY SILENT --ASSUME_SORTED true"
                  + " --REMOVE_DUPLICATES true",
        "index": "samtools index " + rmdups_bam,
        # NOTE(review): if the @SQ names are RefSeq accessions such as
        # NC_000001.11, `grep -v -e _` drops every name, $CHROMOSOMES is empty
        # and no chromosome filter is applied -- confirm against a BAM header.
        "unique": "export CHROMOSOMES=$(samtools view -H " + rmdups_bam
                  + " | grep '^@SQ' | cut -f 2"
                  + " | grep -v -e _ -e chrM -e chrX -e chrY -e 'VN:'"
                  + " | sed 's/SN://' | xargs echo);"
                  + " samtools view -b -h -f 3 -F 4 -F 8 -F 256 -F 1024 -F 2048 -q 30 "
                  + rmdups_bam + " $CHROMOSOMES > " + uniq_bam,
        "index2": "samtools index " + uniq_bam,
        "bamtobed": BEDTOOLS + " bamtobed -i " + uniq_bam + " > " + uniq_bed,
        "macs2": "macs2 callpeak -t " + uniq_bed + " -f BED -g " + GENOME
                 + " --outdir macs.output/" + r1 + ".macs2" + macs_tail,
        "macs": "macs2 callpeak --broad --SPMR -q 0.01 -t " + uniq_bed
                + " -f BED -g " + GENOME
                + " --outdir macs.output/qval/" + r1 + ".macs2" + macs_tail,
        "bamtobigwig": "bamCoverage -b " + rmdups_bam
                       + " -bl macs.output/hg38-blacklist.v2.ensembl.bed -p 2"
                       + " --effectiveGenomeSize " + GENOME
                       + " --normalizeUsing CPM -of bigwig -o "
                       + "bwa.output/rmdups.bam/bigwig/" + r1 + ".bw",
    }


def main(argv):
    """Print the command for the chosen step, one line per sample in the key file."""
    step = argv[1] if len(argv) > 1 else DEFAULT_STEP
    if step not in STEPS:
        sys.exit("unknown step '" + step + "'; choose one of: " + ", ".join(STEPS))
    with open(filepath) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 3:
                # Skip blank or malformed key lines instead of raising IndexError.
                continue
            d, r1, r2 = fields[:3]
            print(build_commands(d, r1, r2)[step])


if __name__ == "__main__":
    main(sys.argv)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
# python scripts/bwa-mem.key.py > commands/bwa-mem.commands.txt
python scripts/bwa-mem.1.key.py > commands/bwa-mem.1.commands.txt
python scripts/bwa-mem.1.2.key.py > commands/bwa-mem.1.2.commands.txt
python scripts/bwa-mem.1.3.key.py > commands/bwa-mem.1.3.commands.txt
python scripts/bwa-mem.2.key.py > commands/bwa-mem.2.commands.txt
python scripts/bwa-mem.2.andes.key.py > commands/bwa-mem.2.commands.andes.txt
python scripts/bwa-mem.2.summit.key.py > commands/bwa-mem.2.commands.summit.txt
python scripts/bwa-mem.3.key.py > commands/bwa-mem.3.commands.txt
python scripts/bwa-mem.4.key.py > commands/bwa-mem.4.commands.txt
python scripts/bwa-mem.4.1.key.py > commands/bwa-mem.4.1.commands.txt
python scripts/bwa-mem.5.key.py > commands/bwa-mem.5.commands.txt
python scripts/bwa-mem.5.summit.key.py > commands/bwa-mem.5.commands.summit.txt
python scripts/bwa-mem.6.key.py > commands/bwa-mem.6.commands.txt
python scripts/bwa-mem.6.bam.key.py > commands/bwa-mem.6.bam.commands.txt
python scripts/bamtobigwig.key.py > commands/bamtobigwig.commands.txt
python scripts/macs.key.py > commands/macs.commands.txt
# salloc -A SYB105 -p gpu -N 1 -t 1:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
conda install -c bioconda bwa
conda install -c bioconda samtools
conda install -c bioconda bamtools
conda install -c cyclus java-jdk
conda install -c bioconda picard
conda install -c bioconda macs2
conda install -c biobuilds picard
conda install -c bioconda bedtools
conda install -c biobuilds bedtools
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name fasta --time 48:00:00 --maxpernode 32 --nodes 1 commands/bwa.fasta.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name bwa --time 48:00:00 --maxpernode 32 --nodes 6 commands/bwa-mem.1.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name rmdups --time 48:00:00 --maxpernode 4 --nodes 6 commands/bwa-mem.2.commands.andes.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name macs2 --time 48:00:00 --maxpernode 4 --nodes 2 commands/bwa-mem.6.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name bamtobigwig --time 48:00:00 --maxpernode 20 --nodes 2 commands/bamtobigwig.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name macs2 --time 48:00:00 --maxpernode 4 --nodes 2 commands/macs.commands.txt
# issue with conda: samtools had an issue with not finding a shared library libcrypto.so.1.0.0 --> conda install -c bioconda samtools=1.9 --force-reinstall
# not enough memory in picard on andes... try summit
# bsub -W 00:15 -nnodes 1 -P SYB105 -Is /bin/bash
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3
conda install -c krinsman ijavascript
conda install -c biobuilds picard
conda install -c biobuilds samtools
conda install -c biobuilds bedtools
# https://github.com/broadinstitute/picard
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bwa.sort --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.1.2.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name index1 --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.1.3.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name rmdups --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.2.commands.summit.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name index --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.3.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name unique --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.4.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name index2 --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.4.1.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bamtobed --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.5.commands.summit.txt
### picard not running
https://github.com/broadinstitute/picard
### to give picard more memory usage
vi `which picard`
default_jvm_mem_opts="-Xms512m -Xmx16g" # add this line to picard script
### job submission not recognizing bedtools in conda environment... give direct path
conda info --envs
### QC: run after bwa-mem alignment and samtools sort steps complete
# samtools stat
# Generate one `samtools stats` command per sample for a chosen BAM stage.
#
# Usage: python <script>.py [stats|stats2|stats3] > commands/<step>.commands.txt
import sys

filepath = '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/atac.key.txt'
SAMSTAT_DIR = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/samstat/"

# Stage emitted when none is given on the command line.
DEFAULT_STATS_STEP = "stats"


def build_stats_commands(r1):
    """Return the samtools stats command for each BAM stage of sample r1.

    stats  -- sorted alignments            (bwa.output/bam/)
    stats2 -- duplicate-removed alignments (bwa.output/rmdups.bam/)
    stats3 -- filtered, duplicate-removed  (bwa.output/rmdups.uniq.bam/)
    """
    return {
        "stats": "samtools stats bwa.output/bam/" + r1 + ".sorted.bam > "
                 + SAMSTAT_DIR + r1 + ".samstat.txt",
        "stats2": "samtools stats bwa.output/rmdups.bam/" + r1 + ".rmdups.bam > "
                  + SAMSTAT_DIR + "rmdups/" + r1 + ".rmdups.samstat.txt",
        # Both directory names need a trailing '/' before the sample name.
        "stats3": "samtools stats bwa.output/rmdups.uniq.bam/" + r1 + ".rmdups.uniq.bam > "
                  + SAMSTAT_DIR + "rmdups.uniq/" + r1 + ".rmdups.uniq.samstat.txt",
    }


def stats_main(argv):
    """Print the chosen stats command for every sample listed in the key file."""
    step = argv[1] if len(argv) > 1 else DEFAULT_STATS_STEP
    if step not in ("stats", "stats2", "stats3"):
        sys.exit("unknown step '" + step + "'; choose stats, stats2 or stats3")
    with open(filepath) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                # Skip blank or malformed key lines; only the R1 name is needed.
                continue
            print(build_stats_commands(fields[1])[step])


if __name__ == "__main__":
    stats_main(sys.argv)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
python scripts/bwa-mem.stats.key.py > commands/bwa-mem.stats.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name stats --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.stats.commands.txt
python scripts/bwa-mem.stats2.key.py > commands/bwa-mem.stats2.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name stats2 --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.stats2.commands.txt
python scripts/bwa-mem.stats3.key.py > commands/bwa-mem.stats3.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name stats3 --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.stats3.commands.txt
# multiqc
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#conda install -c bioconda multiqc
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/samstat
multiqc .
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/samstat/multiqc_report.html /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/.
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/samstat/rmdups/multiqc_report.html /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/macs.output/merge/peaks
#loadGenome.pl -name hg38.ensembl -org null -fasta /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.fna -gtf /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.gtf
annotatePeaks.pl control.olig.merge.macs2.txt hg38.ensembl > control.olig.merge.homer.txt
# Annotation Number of peaks Total size (bp) Log2 Ratio (obs/exp) LogP enrichment (+values depleted)
# TTS 12341.0 24358381 -0.094 30.427
# Exon 26047.0 45057331 0.096 -61.405
# Intron 378637.0 696848334 0.006 -9.872
# Intergenic 320936.0 600104751 -0.016 39.839
# Promoter 15059.0 25694535 0.116 -51.195
annotatePeaks.pl control.gaba.merge.macs2.txt hg38.ensembl > control.gaba.merge.homer.txt
# Annotation Number of peaks Total size (bp) Log2 Ratio (obs/exp) LogP enrichment (+values depleted)
# TTS 10076.0 18318399 -0.042 6.348
# Exon 21661.0 34200241 0.162 -139.766
# Intron 305965.0 528132783 0.033 -166.246
# Intergenic 252012.0 462068349 -0.054 314.627
# Promoter 11518.0 19336965 0.073 -17.563
annotatePeaks.pl control.glu.merge.macs2.txt hg38.ensembl > control.glu.merge.homer.txt
# Annotation Number of peaks Total size (bp) Log2 Ratio (obs/exp) LogP enrichment (+values depleted)
# TTS 8583.0 18318399 -0.025 2.884
# Exon 18240.0 34200241 0.162 -118.592
# Intron 258274.0 528132783 0.037 -174.167
# Intergenic 210928.0 462068349 -0.062 352.490
# Promoter 10151.0 19336965 0.139 -49.948
annotatePeaks.pl heroin.olig.merge.macs2.txt hg38.ensembl > heroin.olig.merge.homer.txt
# Annotation Number of peaks Total size (bp) Log2 Ratio (obs/exp) LogP enrichment (+values depleted)
# TTS 11002.0 21471245 -0.062 12.862
# Exon 23390.0 39823510 0.135 -106.460
# Intron 331512.0 613516608 0.015 -37.857
# Intergenic 280134.0 535378500 -0.032 122.491
# Promoter 13328.0 22672596 0.136 -62.106
annotatePeaks.pl heroin.gaba.merge.macs2.txt hg38.ensembl > heroin.gaba.merge.homer.txt
# Annotation Number of peaks Total size (bp) Log2 Ratio (obs/exp) LogP enrichment (+values depleted)
# TTS 10230.0 18318399 -0.042 6.467
# Exon 21749.0 34200241 0.146 -114.597
# Intron 311170.0 528132783 0.036 -192.169
# Intergenic 255688.0 462068349 -0.055 333.335
# Promoter 11669.0 19336965 0.070 -16.406
annotatePeaks.pl heroin.glu.merge.macs2.txt hg38.ensembl > heroin.glu.merge.homer.txt
# Annotation Number of peaks Total size (bp) Log2 Ratio (obs/exp) LogP enrichment (+values depleted)
# TTS 8469.0 18318399 0.005 -1.003
# Exon 18043.0 34200241 0.196 -168.254
# Intron 251065.0 528132783 0.046 -252.926
# Intergenic 201607.0 462068349 -0.078 530.687
# Promoter 10008.0 19336965 0.168 -70.031
## Make a custom "SAF" file which featureCounts needs.
## SAF columns are: GeneID, Chr, Start, End, Strand. narrowPeak starts are
## 0-based, so Start is $2+1; the peak ID is built as chr-start-end.
awk 'OFS="\t" {print $1"-"$2+1"-"$3, $1, $2+1, $3, "+"}' foo.narrowPeak > foo.saf
## run featureCounts on the SAF file written above (add -T for multithreading)
featureCounts -p -a foo.saf -F SAF -o out.txt atacseq.bam
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# conda install -c bioconda subread
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
# Build the SAF annotation (GeneID, Chr, Start, End, Strand) from this sample's
# narrowPeak file, then count reads in peaks with featureCounts.
peakdir=bwa.output/macs.output/OLIG-302_TCCTGAGC-TTATGCGA_HGHM2DSXY_L003_001.R1.macs2
awk 'OFS="\t" {print $1"-"$2+1"-"$3, $1, $2+1, $3, "+"}' "$peakdir/opioid.atac_peaks.narrowPeak" > "$peakdir/opioid.atac.saf"
featureCounts -p -a "$peakdir/opioid.atac.saf" -F SAF -o "$peakdir/opioid.atac.frip.txt" bwa.output/rmdups.bam/OLIG-302_TCCTGAGC-TTATGCGA_HGHM2DSXY_L003_001.R1.rmdups.bam
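# chrom sizes for proper human genome fasta file
# samtools faidx writes genome.fa.fai (it prints nothing to stdout), so cut the index file:
# samtools faidx genome.fa && cut -f1,2 genome.fa.fai > chromsizes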
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/
cut -f1,2 GCF_000001405.39_GRCh38.p13_genomic.fna.fai > GRCh38.p13.size
grep 'NC_0000' GRCh38.p13.size > GRCh38.p13.chr.size
# make genome window files (100 bp, 1 kb and 500 bp)
bedtools makewindows -g GRCh38.p13.chr.size -w 100 > GRCh38.p13.windows.bed
bedtools makewindows -g GRCh38.p13.chr.size -w 1000 > GRCh38.p13.windows.1kb.bed
bedtools makewindows -g GRCh38.p13.chr.size -w 500 > GRCh38.p13.windows.500.bed
# bedtools coverage [OPTIONS] -a <FILE> -b <FILE1, FILE2, ..., FILEN>
### summit
# Generate one `bedtools coverage` command per sample, counting reads from the
# duplicate-removed BAM in each fixed-size genome window.
import sys

filepath = '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/atac.key.txt'
BEDTOOLS = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/bin/bedtools"
PROJECT = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
WINDOWS = PROJECT + "ref/GRCh38.p13.windows.bed"
RMDUPS_DIR = PROJECT + "bwa.output/rmdups.bam/"


def build_bins_command(r1):
    """Return the bedtools coverage command for sample r1.

    The windows file is passed as -a and the BAM as -b, so the output has one
    row per window (coordinates first, then the read count). This matches the
    hand-written single-sample command in these notes; the previous
    `-abam <bam> -b <windows>` form had the two roles swapped.
    """
    return (BEDTOOLS + " coverage"
            + " -b " + RMDUPS_DIR + r1 + ".rmdups.bam"
            + " -a " + WINDOWS
            + " > " + RMDUPS_DIR + "bins/" + r1 + ".bins.bed")


def bins_main():
    """Print the coverage command for every sample listed in the key file."""
    with open(filepath) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                # Skip blank or malformed key lines; only the R1 name is needed.
                continue
            print(build_bins_command(fields[1]))


if __name__ == "__main__":
    bins_main()
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/bin/bedtools coverage -b /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/GABA-405_CGAGGCTG-GAGCCTTA_HGHM2DSXY_L004_001.R1.rmdups.bam -a /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.windows.bed > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/bins/GABA-405_CGAGGCTG-GAGCCTTA_HGHM2DSXY_L004_001.R1.bins.bed
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
#python scripts/bins.cov.key.py > commands/bins.cov.commands.txt
#/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov --time 02:00:00 --maxpernode 2 --nodes 2 commands/bins.cov.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov1 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands1.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov2 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands2.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov3 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands3.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov4 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands4.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov5 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands5.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov6 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands6.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov7 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands7.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov8 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands8.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov9 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands9.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov10 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands10.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov11 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands11.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bins.cov12 --time 02:00:00 --maxpernode 1 --nodes 5 commands/bins.cov.commands12.txt
# try a different method (deepTools multiBamSummary), since bedtools coverage is using too much memory?
multiBamSummary BED-file --BED selection.bed --bamfiles file1.bam file2.bam -o results.npz
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam
multiBamSummary bins --smartLabels -bs 100 -p 2 -o merge.npz --bamfiles control.olig.merge.bam control.gaba.merge.bam control.glu.merge.bam heroin.olig.merge.bam heroin.gaba.merge.bam heroin.glu.merge.bam
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name bam.summary --time 24:00:00 --maxpernode 2 --nodes 3 ../../commands/bam.summary.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name bam.summary --time 48:00:00 --maxpernode 4 --nodes 1 ../../commands/bam.summary.all.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name bam.merge.summary --time 48:00:00 --maxpernode 4 --nodes 1 ../../commands/bam.summary.merge.commands.txt
# salloc -A SYB105 -p gpu -N 1 -t 1:00:00
plotPCA -in control.olig.npz -o control.olig.pca.png
plotPCA -in control.gaba.npz -o control.gaba.pca.png
plotPCA -in control.glu.npz -o control.glu.pca.png
plotPCA -in heroin.olig.npz -o heroin.olig.pca.png
plotPCA -in heroin.gaba.npz -o heroin.gaba.pca.png
plotPCA -in heroin.glu.npz -o heroin.glu.pca.png
plotPCA -in all.samples.npz -o all.samples.pca.png
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/all.samples.pca.png .
plotCorrelation --corData control.olig.npz --colorMap RdYlBu --plotNumbers --corMethod pearson --whatToPlot heatmap --plotFile control.olig.cor.heatmap.png --outFileCorMatrix control.olig.cor.tab
plotCorrelation --corData all.samples.npz --colorMap RdYlBu --plotNumbers --corMethod pearson --whatToPlot heatmap --plotFile all.samples.cor.heatmap.png --outFileCorMatrix PearsonCorrScores.tab
plotCorrelation --corData all.samples.npz --corMethod pearson --whatToPlot scatterplot --plotFile all.samples.cor.scatter.png
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/control.olig.cor.heatmap.png .
# Load the six sig5 narrowPeak tables (control/heroin x OLIG/GABA/GLU) and the
# gene annotation, and name their columns.
# The path is a plain R string: shell-style backslash escapes ("\ ", "\(") are
# not valid inside R string literals and make the line fail to parse.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")

# Read one headerless, tab-separated narrowPeak file; the 4th column (peak
# name) is given a sample-specific name so it can be used as a join key later.
read.peaks <- function(file, peak.id) {
  peaks <- read.delim(file, header=FALSE, sep="\t")
  colnames(peaks) <- c("chr", "start", "end", peak.id, "val", "strand", "sig", "pval", "qval", "val2")
  peaks
}

control.olig <- read.peaks("OLIG-control-sig5.narrowPeak", "control.olig.peak")
control.gaba <- read.peaks("GABA-control-sig5.narrowPeak", "control.gaba.peak")
control.glu <- read.peaks("GLU-control-sig5.narrowPeak", "control.glu.peak")
heroin.olig <- read.peaks("OLIG-heroin-sig5.narrowPeak", "heroin.olig.peak")
heroin.gaba <- read.peaks("GABA-heroin-sig5.narrowPeak", "heroin.gaba.peak")
heroin.glu <- read.peaks("GLU-heroin-sig5.narrowPeak", "heroin.glu.peak")

# Gene records from the GTF (9 tab-separated columns).
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header=FALSE, sep="\t")
colnames(gene) <- c("chr", "source", "type", "start", "end", "dot1", "strand", "dot2", "id")
length(unique(control.olig$control.olig.peak))
# 32137
length(unique(control.gaba$control.gaba.peak))
# 22315
length(unique(control.glu$control.glu.peak))
# 39544
length(unique(heroin.olig$heroin.olig.peak))
# 49925
length(unique(heroin.gaba$heroin.gaba.peak))
# 31102
length(unique(heroin.glu$heroin.glu.peak))
# 45727
library(tidygenomics)
library(dplyr)
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("IRanges")
# peaks overlapping genes
control.olig.gene <- genome_intersect(control.olig, gene, by=c("chr", "start", "end"))
# 24376 / 32137 = 0.7585027
control.gaba.gene <- genome_intersect(control.gaba, gene, by=c("chr", "start", "end"))
# 17079 / 22315 = 0.7653596
control.glu.gene <- genome_intersect(control.glu, gene, by=c("chr", "start", "end"))
# 28188 / 39544 = 0.7128262
heroin.olig.gene <- genome_intersect(heroin.olig, gene, by=c("chr", "start", "end"))
# 35968 / 49925 = 0.7204407
heroin.gaba.gene <- genome_intersect(heroin.gaba, gene, by=c("chr", "start", "end"))
# 23021 / 31102 = 0.7401775
heroin.glu.gene <- genome_intersect(heroin.glu, gene, by=c("chr", "start", "end"))
# 31953 / 45727 = 0.6987775
# peaks in proximal regions (<2kb from gene)
# Build 2 kb flanking intervals on both sides of every gene record.
# gene.up covers [start - 2000, start]; gene.down covers [end, end + 2000].
# The strand column is not consulted, so "up"/"down" are in coordinate order,
# not transcription order.
# NOTE(review): start - 2000 can go below zero for genes near the beginning of
# a sequence -- confirm the downstream intersect tolerates that.
gene.up <- gene
gene.up$end <- gene.up$start
gene.up$start <- gene.up$start - 2000
gene.down <- gene
gene.down$start <- gene.down$end
gene.down$end <- gene.down$end + 2000
# Stack both flank sets into one table; all other gene columns are carried over.
gene.proximal <- rbind(gene.up, gene.down)
control.olig.prox <- genome_intersect(control.olig, gene.proximal, by=c("chr", "start", "end"))
# 10826 / 32137 = 0.3368703
control.gaba.prox <- genome_intersect(control.gaba, gene.proximal, by=c("chr", "start", "end"))
# 8856 / 22315 = 0.3968631
control.glu.prox <- genome_intersect(control.glu, gene.proximal, by=c("chr", "start", "end"))
# 10543 / 39544 = 0.2666144
heroin.olig.prox <- genome_intersect(heroin.olig, gene.proximal, by=c("chr", "start", "end"))
# 12702 / 49925 = 0.2544216
heroin.gaba.prox <- genome_intersect(heroin.gaba, gene.proximal, by=c("chr", "start", "end"))
# 10096 / 31102 = 0.3246093
heroin.glu.prox <- genome_intersect(heroin.glu, gene.proximal, by=c("chr", "start", "end"))
# 10787 / 45727 = 0.2359
# peaks genic (not proximal... aka: not overlapping the edge of the gene)
control.olig.gene.noprox <- anti_join(control.olig.gene, control.olig.prox, by="control.olig.peak")
# 14496 / 32137 = 0.4510689
control.gaba.gene.noprox <- anti_join(control.gaba.gene, control.gaba.prox, by="control.gaba.peak")
# 8840 / 22315 = 0.3961461
control.glu.gene.noprox <- anti_join(control.glu.gene, control.glu.prox, by="control.glu.peak")
# 18690 / 39544 = 0.4726381
heroin.olig.gene.noprox <- anti_join(heroin.olig.gene, heroin.olig.prox, by="heroin.olig.peak")
# 25005 / 49925 = 0.5008513
heroin.gaba.gene.noprox <- anti_join(heroin.gaba.gene, heroin.gaba.prox, by="heroin.gaba.peak")
# 13795 / 31102 = 0.4435406
heroin.glu.gene.noprox <- anti_join(heroin.glu.gene, heroin.glu.prox, by="heroin.glu.peak")
# 22428 / 45727 = 0.4904761
# plot distribution of peaks relative to gene annotations
library(ggplot2)
library(reshape2)
library(RColorBrewer)
df <- data.frame(control.olig = c(45.1, 33.7, 100-45.1-33.7),
control.gaba = c(39.6, 39.6, 100-39.6-39.6), control.glu = c(47.3, 26.7, 100-47.3-26.7),
heroin.olig = c(50.1, 25.4, 100-50.1-25.4), heroin.gaba = c(44.4, 32.5, 100-44.4-32.5),
heroin.glu = c(49, 23.6, 100-49-23.6))
df$annotation <- c("genic", "proximal (<2kb)", "distal (>2kb)")
df.melt <- melt(df, id="annotation")
ggplot(df.melt, aes(x=variable, y=value, fill=annotation)) + geom_bar(stat="identity") + theme_classic() + scale_fill_brewer(palette="Dark2") + xlab("") + ylab("Proportion of Peak with Significance value > 5")
# peaks unique to control or heroin samples
olig.intersect <- genome_intersect(control.olig, heroin.olig, by=c("chr", "start", "end"))
control.olig.uniq <- anti_join(control.olig, olig.intersect, by="control.olig.peak")
# 3784
heroin.olig.uniq <- anti_join(heroin.olig, olig.intersect, by="heroin.olig.peak")
# 21001
gaba.intersect <- genome_intersect(control.gaba, heroin.gaba, by=c("chr", "start", "end"))
control.gaba.uniq <- anti_join(control.gaba, gaba.intersect, by="control.gaba.peak")
# 4334
heroin.gaba.uniq <- anti_join(heroin.gaba, gaba.intersect, by="heroin.gaba.peak")
# 13153
glu.intersect <- genome_intersect(control.glu, heroin.glu, by=c("chr", "start", "end"))
control.glu.uniq <- anti_join(control.glu, glu.intersect, by="control.glu.peak")
# 9240
heroin.glu.uniq <- anti_join(heroin.glu, glu.intersect, by="heroin.glu.peak")
# 15325
write.table(control.olig.uniq, "control.olig.uniq.peak.bed", quote=F, row.names=F, col.names=F, sep="\t")
write.table(heroin.olig.uniq, "heroin.olig.uniq.peak.bed", quote=F, row.names=F, col.names=F, sep="\t")
write.table(control.gaba.uniq, "control.gaba.uniq.peak.bed", quote=F, row.names=F, col.names=F, sep="\t")
write.table(heroin.gaba.uniq, "heroin.gaba.uniq.peak.bed", quote=F, row.names=F, col.names=F, sep="\t")
write.table(control.glu.uniq, "control.glu.uniq.peak.bed", quote=F, row.names=F, col.names=F, sep="\t")
write.table(heroin.glu.uniq, "heroin.glu.uniq.peak.bed", quote=F, row.names=F, col.names=F, sep="\t")
# Plot of number of sig>5 peaks that are unique (differential) versus shared
df <- data.frame(peaks = c(32137, 49925, 22315, 31102, 39544, 45727),
differential.peaks = c(3784, 21001, 4334, 13153, 9240, 15325))
df$condition <- c("control", "heroin", "control", "heroin", "control", "heroin")
df$tissue <- c("olig", "olig", "gaba", "gaba", "glu", "glu")
df.melt <- melt(df, id=c("condition", "tissue"))
ggplot(df.melt, aes(x=condition, y=value, fill=variable)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ tissue) + theme_classic() + scale_fill_brewer(palette="Paired") + xlab("") + ylab("Number of Peaks with Significance value > 5")
df$prop <- df$differential.peaks / df$peaks
df.prop <- df[,3:5]
df.prop.melt <- melt(df.prop, id=c("condition", "tissue"))
ggplot(df.prop.melt, aes(x=tissue, y=value, fill=condition)) + geom_bar(stat="identity", position="dodge") + theme_classic() + scale_fill_brewer(palette="Paired") + xlab("") + ylab("Proportion of Peaks (sig > 5) that are unique to that sample")
df.count <- subset(df.melt, df.melt$variable == "peaks")
ggplot(df.count, aes(x=condition, y=value, fill=tissue)) + geom_bar(stat="identity", position="dodge") + theme_classic() + scale_fill_brewer(palette="Set1") + xlab("") + ylab("Number of Peaks with Significance value > 5")
# R
library(dplyr)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/bins")
file_list <- list.files(path="/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/bins")
## need to delete any empty files in the path first (read.table fails on them): find . -type f -empty -delete
# Merge the per-sample bin coverage files into one table keyed on the bin
# coordinates (V1, V2, V3), adding one count column per file in file_list order.
# Start from NULL rather than testing exists("dataset"): that test let the
# first file be read and then joined a second time in the same iteration, and
# it would also silently reuse a stale `dataset` left over in the session.
dataset <- NULL
for (file in file_list){
  temp_dataset <- read.table(file, header=FALSE, sep="\t", stringsAsFactors=FALSE)[,1:4]
  # Name the count column after its file so repeated joins never collide on "V4".
  colnames(temp_dataset)[4] <- file
  if (is.null(dataset)){
    dataset <- temp_dataset
  } else {
    dataset <- full_join(dataset, temp_dataset, by=c("V1", "V2", "V3"))
  }
  rm(temp_dataset)
}
dataset.coord <- dataset[,1:3]
dataset.file <- dataset[,4:ncol(dataset)]
colnames(dataset.coord) <- c("chr", "start", "end")
file_list_df <- data.frame(file_list)
file_list_id <- separate(file_list_df, file_list, c("sample", "id"), sep="_")
colnames(dataset.file) <- file_list_id$sample
dataset <- cbind(dataset.coord, dataset.file)
write.table(dataset, "bin.matrix.txt", quote=F, row.names=F, sep="\t")
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/bins/bin.matrix.txt
# Convert the raw bin matrix to CPM, write it (without a header line), then
# build a long bin x sample table annotated with each sample's condition.
dataset <- read.delim("bin.matrix.txt", header=T, sep="\t", stringsAsFactors=FALSE)
dataset.coord <- dataset[,1:3]
dataset.file <- dataset[,4:ncol(dataset)]
# bins absent from a sample's file come out of the full_join as NA; treat them
# as zero so the column sums used for the library sizes are defined
dataset.file[is.na(dataset.file)] <- 0
library(edgeR)
dataset.cpm <- cpm(dataset.file, normalized.lib.sizes=TRUE, log=FALSE)
dataset <- cbind(dataset.coord, dataset.cpm)
write.table(dataset, "bin.cpm.matrix.id.txt", quote=F, row.names=F, col.names=F, sep="\t")
# The file was just written WITHOUT column names, so it must be read back with
# header=FALSE (header=T consumed the first bin as a header row and left the
# sample names unusable for the join below). Restore the names from `dataset`.
dataset.cpm <- read.delim("bin.cpm.matrix.id.txt", header=FALSE, sep="\t", stringsAsFactors=FALSE)
colnames(dataset.cpm) <- colnames(dataset)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac")
sample.id <- read.delim("atac.csaw.key1.rmdups.txt", header=T, sep="\t")
sample.id2 <- separate(sample.id, sample.name, c("sample", "id"), sep="_")
library(reshape2)
# Keep the coordinates as id columns; without id.vars, melt() treated the
# numeric start/end columns as two extra "samples".
df <- melt(dataset.cpm, id.vars=c("chr", "start", "end"), variable.name="sample", value.name="cpm")
df$sample <- as.character(df$sample)
# NOTE(review): read.delim(header=T) rewrites "-" in column names to "."; confirm
# the sample names in the key file use the same spelling or this join matches nothing.
df.id <- left_join(df, sample.id2[,c(1,3)], by="sample")
# one row per bin, one column per condition (mean CPM over that condition's samples);
# the coordinates are carried through the cast, so no cbind with dataset.coord is needed
df.id.dcast <- dcast(df.id, chr + start + end ~ condition, value.var = "cpm", fun.aggregate = mean)
df.id.coord <- df.id.dcast
write.table(df.id, "bin.cpm.matrix.condition.txt", quote=F, row.names=F, col.names=F, sep="\t")
# grep 'condition.olig' bin.cpm.matrix.condition.txt > control.olig.matrix.txt
# grep 'condition.gaba' bin.cpm.matrix.condition.txt > control.gaba.matrix.txt
# grep 'condition.glu' bin.cpm.matrix.condition.txt > control.glu.matrix.txt
# grep 'heroin.olig' bin.cpm.matrix.condition.txt > heroin.olig.matrix.txt
# grep 'heroin.gaba' bin.cpm.matrix.condition.txt > heroin.gaba.matrix.txt
# grep 'heroin.glu' bin.cpm.matrix.condition.txt > heroin.glu.matrix.txt
# remove lines that sum to zero
# Keep only rows whose fields 4..NF do not sum to zero.
awk '{ total = 0; for (col = 4; col <= NF; col++) total += $col; if (total != 0) print }' bin.cpm.matrix.id.txt > bin.cpm.matrix.id.nonzero.txt
# Prepend the header line to both the full and the non-zero matrix.
for stem in bin.cpm.matrix.id bin.cpm.matrix.id.nonzero; do
    cat bin.matrix.header.txt "$stem.txt" > "$stem.header.txt"
done
# calculate mean and variance of each sample...
# remove background noise? anything that has < XX reads is removed...
# get correlation between samples to plot
# Sample-by-sample correlation plot of the non-zero CPM bin matrix.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
dataset <- read.delim("bin.cpm.matrix.id.nonzero.header.txt", header=T, sep="\t")
library(ggcorrplot)
# Correlate the sample columns only: the first three columns are the bin
# coordinates (chr is not numeric, so cor(dataset) fails, and start/end are
# not samples).
sample.cpm <- dataset[,4:ncol(dataset)]
corr <- round(cor(sample.cpm), 1)
p.mat <- cor_pmat(sample.cpm)
pdf("cor.plot.pdf")
ggcorrplot(corr, hc.order = TRUE, outline.col = "white")
dev.off()
# salloc -A SYB105 -p gpu -N 1 -t 4:00:00
# module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins
# grep 'NC_000001.11' bin.cpm.matrix.id.header.txt > bin.cpm.matrix.id.chr1.txt
# R
# Per-sample summary statistics over the chr1 rows of the CPM matrix.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
# the grep above drops the header line, hence header=F
dataset <- read.delim("bin.cpm.matrix.id.chr1.txt", header=F, sep="\t")
library(matrixStats)
# every column after the three coordinate columns (was hard-coded as 4:183,
# which only works for exactly 180 value columns)
df <- dataset[,4:ncol(dataset)]
df.mat <- as.matrix(df)
colVars(df.mat)
# [1] 0.004053502 0.004053502 0.011056247 0.018592847 0.012452881 0.008509306
# [7] 0.005249184 0.011354367 0.005893361 0.019110815 0.012042861 0.011646257
# [13] 0.014584874 0.029569877 0.008006736 0.018105243 0.029793770 0.007780984
# [19] 0.013583174 0.020894215 0.011593435 0.017424278 0.005050321 0.009251329
# [25] 0.003576256 0.015154273 0.003289923 0.015868166 0.018564106 0.012205283
# [31] 0.014667055 0.005607420 0.007599597 0.012104677 0.005689803 0.011029179
# [37] 0.007902484 0.016873441 0.006526545 0.014664839 0.011240874 0.008890407
# [43] 0.008826054 0.012668842 0.018095545 0.011199368 0.011902554 0.013192940
# [49] 0.014065711 0.009269166 0.007953365 0.010129546 0.009623983 0.006744169
# [55] 0.007175339 0.018009467 0.010075941 0.019968244 0.009290152 0.006304366
# [61] 0.011965395 0.013831849 0.007111501 0.025395312 0.010018092 0.006115925
# [67] 0.010777257 0.013043650 0.011122409 0.016978451 0.020424907 0.029169264
# [73] 0.018115568 0.007945271 0.025072477 0.017262127 0.005660722 0.013334929
# [79] 0.010875450 0.016433534 0.055192492 0.014784930 0.015242399 0.013191587
# [85] 0.008155960 0.010075247 0.006717925 0.082462442 0.013033944 0.024770656
# [91] 0.021168053 0.032781571 0.028825297 0.016757315 0.026613437 0.023923968
# [97] 0.022617936 0.017338097 0.017278328 0.029042681 0.017319972 0.012599141
# [103] 0.033193204 0.017453541 0.012271060 0.046364085 0.016285712 0.009409680
# [109] 0.013839821 0.022327914 0.010294686 0.022102128 0.038725300 0.030113437
# [115] 0.083468618 0.022656921 0.013152188 0.027173805 0.013844974 0.020471371
# [121] 0.018824317 0.019049510 0.013133139 0.013478444 0.036851339 0.019792723
# [127] 0.023919815 0.013872570 0.014047843 0.011729129 0.050354066 0.024975908
# [133] 0.028366837 0.011524052 0.010771832 0.027241128 0.030468805 0.017263853
# [139] 0.010576263 0.022849244 0.014652274 0.024086708 0.055714172 0.034357201
# [145] 0.014979666 0.036424006 0.025253960 0.015741784 0.032154539 0.025752913
# [151] 0.022979376 0.013249374 0.016356116 0.015110519 0.005357184 0.010296026
# [157] 0.012009264 0.028730258 0.020365601 0.026947440 0.036971693 0.030687947
# [163] 0.033451924 0.022031313 0.013372611 0.010169133 0.013685180 0.009564272
# [169] 0.013439565 0.026048278 0.030853795 0.010508498 0.016393417 0.009306690
# [175] 0.020081407 0.014651140 0.080890619 0.014665290 0.013265721 0.033913111
# Median of each column of df.mat; the recorded output follows.
colMedians(df.mat)
# [1] 0.02207059 0.02207059 0.02143704 0.02310836 0.02528641 0.02427585
# [7] 0.02821143 0.02529540 0.02764340 0.02206199 0.02515361 0.02326442
# [13] 0.02552782 0.02605143 0.02187698 0.02435632 0.02884006 0.02578085
# [19] 0.02788538 0.03217848 0.02643577 0.01972474 0.02239056 0.02438889
# [25] 0.02755772 0.02761453 0.02793814 0.02873711 0.01832672 0.02436200
# [31] 0.02505114 0.02479486 0.02044886 0.02529428 0.02361347 0.02609186
# [37] 0.02880739 0.02290379 0.02060680 0.02466077 0.02679300 0.01992951
# [43] 0.02530046 0.02026770 0.01836490 0.02139904 0.01787567 0.02007888
# [49] 0.02211116 0.02777082 0.02962552 0.01950526 0.02251428 0.02609464
# [55] 0.02735177 0.02849040 0.02654447 0.01681043 0.02249424 0.02244720
# [61] 0.02466621 0.01938553 0.02216041 0.02629048 0.02489163 0.02716874
# [67] 0.02750771 0.02162227 0.02610229 0.02053712 0.01999462 0.02019991
# [73] 0.02668628 0.02814576 0.02313843 0.02340327 0.02448685 0.02094626
# [79] 0.02482051 0.02015845 0.02393200 0.01867882 0.01500242 0.01670947
# [85] 0.03159126 0.02515981 0.02677749 0.00000000 0.01635651 0.00000000
# [91] 0.01484555 0.02083045 0.01768195 0.02174991 0.01861795 0.01844058
# [97] 0.01806907 0.01883807 0.01870480 0.02061788 0.01960684 0.01863857
# [103] 0.02020517 0.02307588 0.02005915 0.00000000 0.02325398 0.01934935
# [109] 0.02349966 0.01821213 0.02278083 0.01982209 0.02030717 0.02159790
# [115] 0.00000000 0.01946784 0.02010897 0.01737422 0.02128435 0.02151362
# [121] 0.00000000 0.02006185 0.02004255 0.01899946 0.02746989 0.01913028
# [127] 0.02386091 0.01919040 0.00000000 0.02176137 0.02269661 0.01812294
# [133] 0.00000000 0.01783428 0.01870736 0.01757439 0.02282973 0.02448003
# [139] 0.02052728 0.02014527 0.01884404 0.01928192 0.02146785 0.01735938
# [145] 0.02074923 0.02108892 0.01837509 0.01781071 0.02236569 0.02172414
# [151] 0.02071907 0.01605044 0.02061845 0.01757925 0.02739311 0.02027802
# [157] 0.00000000 0.02498180 0.01357315 0.00000000 0.02247384 0.02193963
# [163] 0.02164550 0.02530392 0.02016127 0.02180651 0.01829089 0.01968607
# [169] 0.01758644 0.01731975 0.00000000 0.02247339 0.02143366 0.01789547
# [175] 0.02319626 0.02260586 0.00000000 0.01661858 0.01561362 0.00000000
# Quantiles of each column of df.mat; the first rows of the recorded output follow.
colQuantiles(df.mat)
# 0% 25% 50% 75% 100%
# V4 0 0.011035296 0.02207059 0.04965883 14.00379
# V5 0 0.011035296 0.02207059 0.04965883 14.00379
# V6 0 0.008574814 0.02143704 0.04287407 29.76747
# V7 0 0.011554182 0.02310836 0.04621673 71.50883
# V8 0 0.012643205 0.02528641 0.04425122 54.09395
# V9 0 0.012137924 0.02427585 0.04855170 34.99970
# V10 0 0.011284572 0.02821143 0.05078058 28.13808
# V11 0 0.012647702 0.02529540 0.05059081 37.83560
# V12 0 0.011057358 0.02764340 0.04975811 24.88458
# V13 0 0.007353998 0.02206199 0.04412399 71.24554
# V14 0 0.012576805 0.02515361 0.04087462 53.45771
# V15 0 0.011632211 0.02326442 0.04652884 34.19870
# how many of the samples have a value > 10 for that bin...
# Count over df.mat (the value columns only). Counting over df itself meant the
# second rowSums() below also counted the SamplesGr10 helper column as a sample.
# The zero / non-zero row tallies recorded here are unaffected by that.
df$SamplesGr10 <- rowSums(df.mat>10)
# 2489565
nrow(subset(df, df$SamplesGr10 == 0))
# 2488967 / 2489565 = 0.9997598
nrow(subset(df, df$SamplesGr10 > 0))
# 598
# how many of the samples have a value > 1 for that bin...
df$SamplesGr1 <- rowSums(df.mat>1)
# 2489565
nrow(subset(df, df$SamplesGr1 == 0))
# 2456688 / 2489565 = 0.9867941
nrow(subset(df, df$SamplesGr1 > 0))
# 32877 / 2489565 = 0.01320592
######### keep bins where at least one sample has a value > 1
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
# This file is bin.matrix.header.txt followed by the data, so it has a header
# line: read it with header=T. With header=F the header became a data row and
# every column was read as text, which makes the > 1 comparison below meaningless.
dataset <- read.delim("bin.cpm.matrix.id.header.txt", header=T, sep="\t", stringsAsFactors=FALSE)
# 30882712
library(matrixStats)
library(dplyr)
df <- dataset[,4:ncol(dataset)]
# number of samples with a value > 1 in each bin
n.gr1 <- rowSums(as.matrix(df) > 1)
# which() drops NA counts, as subset() did
keep.rows <- which(n.gr1 > 0)
# Filter the full table directly; this replaces the join on row names and the
# fixed column positions (183:185, 1:180) that assumed exactly 180 value columns.
df.gr1.coord2 <- dataset[keep.rows,]
# 352039 / 30882712 = 0.01139923 = 1.1%
write.table(df.gr1.coord2, "bin.cpm.matrix.id.gr1.txt", quote=F, row.names=F, col.names=F, sep="\t")
# cat bin.matrix.header.txt bin.cpm.matrix.id.gr1.txt > bin.cpm.matrix.id.gr1.header.txt
# distribution of "samples > 1" over the retained bins
colQuantiles(matrix(n.gr1[keep.rows], ncol=1))
# 0% 25% 50% 75% 100%
# 1 1 3 6 180
# salloc -A SYB105 -p gpu -N 1 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R: PCA of the samples on the filtered CPM bin matrix, coloured by condition.
library(dplyr)
library(tidyr)
library(devtools)
library(ggbiplot)
# sed 's/-/./g' atac.metadata.txt > atac.metadata.point.txt
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
df <- read.delim("atac.metadata.point.txt", header=F, sep="\t")
colnames(df) <- c("id", "condition")
# sample = part of the id before the first "_"
df2 <- separate(df, id, c("sample", "seq"), sep="_")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
#dataset <- read.delim("bin.cpm.matrix.id.txt", header=F, sep="\t")
#dataset <- read.delim("bin.matrix.txt", header=F, sep="\t")
dataset <- read.delim("bin.cpm.matrix.id.gr1.header.txt", header=T, sep="\t", stringsAsFactors = F)
df.num <- as.matrix(dataset[,4:ncol(dataset)])
samples <- colnames(dataset[,4:ncol(dataset)])
key <- data.frame(sample = samples)
key.condition <- left_join(key, df2, by="sample")
key.pca <- key.condition$condition
# samples as rows, bins as variables
df.pca <- prcomp(t(df.num), scale. = TRUE)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
pdf("macs.pca.pdf")
ggbiplot(df.pca, obs.scale = 1, var.scale = 1,
         groups = key.pca, ellipse = TRUE, circle = TRUE) +
  theme(legend.direction = 'horizontal', legend.position = 'top')
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/macs.pca.pdf .
PC <- data.frame(df.pca$x)
PC$sample <- rownames(PC)
PC.label <- left_join(PC, key.condition, by="sample")
# Select the PC columns and the condition by name; the old c(1:180,183) only
# worked when there were exactly 180 principal components.
pc.cols <- grep("^PC[0-9]+$", colnames(PC.label), value=TRUE)
PC.plot <- PC.label[,c(pc.cols, "condition")]
pdf("macs.pca2.pdf")
ggplot(PC.plot,aes(x=PC1,y=PC2,col=condition))+
  geom_point(size=3,alpha=0.5) +
  theme_classic()
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/macs.pca2.pdf .
pdf("macs.pca3.pdf")
ggplot(PC.plot,aes(x=PC2,y=PC3,col=condition))+
  geom_point(size=3,alpha=0.5) +
  theme_classic()
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/macs.pca3.pdf .
pdf("macs.pca4.pdf")
ggplot(PC.plot,aes(x=PC1,y=PC3,col=condition))+
  geom_point(size=3,alpha=0.5) +
  theme_classic()
dev.off()
# salloc -A SYB105 -p gpu -N 1 -t 4:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R: sample-by-sample correlation plots on the filtered CPM bin matrix,
# once labelled by sample and once labelled by condition.
library(tidyr)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
df <- read.delim("atac.metadata.point.txt", header=F, sep="\t")
colnames(df) <- c("id", "condition")
df2 <- separate(df, id, c("sample", "seq"), sep="_")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
dataset <- read.delim("bin.cpm.matrix.id.gr1.header.txt", header=T, sep="\t", stringsAsFactors = F)
df.num <- as.matrix(dataset[,4:ncol(dataset)])
samples <- colnames(dataset[,4:ncol(dataset)])
key <- data.frame(sample = samples)
key.condition <- left_join(key, df2, by="sample")
key.pca <- key.condition$condition
library(ggcorrplot)
corr <- round(cor(df.num), 1)
p.mat <- cor_pmat(df.num)
pdf("cor.plot.pdf")
ggcorrplot(corr, hc.order = TRUE, outline.col = "white", lab_size = 1)
dev.off()
pdf("cor.plot.sig.pdf")
ggcorrplot(corr, hc.order = TRUE, type = "lower", p.mat = p.mat)
dev.off()
df.num.cond <- df.num
# Label each column "<condition>.<sample>". Using the bare condition gave many
# columns the same name, and the plot axes are built from the matrix dimnames,
# so samples sharing a condition were drawn on top of one another.
colnames(df.num.cond) <- paste(key.pca, samples, sep=".")
corr.lab <- round(cor(df.num.cond), 1)
p.mat.lab <- cor_pmat(df.num.cond)
pdf("cor.plot.lab.pdf")
ggcorrplot(corr.lab, hc.order = TRUE, outline.col = "white", lab_size = 1)
dev.off()
pdf("cor.plot.sig.lab.pdf")
ggcorrplot(corr.lab, hc.order = TRUE, type = "lower", p.mat = p.mat.lab)
dev.off()
# Copy every per-sample narrowPeak file into a flat peaks/ directory, naming
# each copy after the sample directory it came from.
# Stop if the directory is missing, so nothing is created or copied elsewhere.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output || exit 1
mkdir -p peaks || exit 1
for pathname in "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output"/*/opioid.atac_peaks.narrowPeak; do
    # an unmatched glob is left as the literal pattern; skip it instead of handing it to cp
    [ -e "$pathname" ] || continue
    cp "$pathname" "peaks/$( basename "$( dirname "$pathname" )" ).bed"
done
gzip /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/peaks/*.bed
# Same for the merged peak calls (copied as .txt, not compressed).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/macs.output/merge || exit 1
mkdir -p peaks || exit 1
for pathname in "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/macs.output/merge"/*/opioid.atac_peaks.narrowPeak; do
    [ -e "$pathname" ] || continue
    cp "$pathname" "peaks/$( basename "$( dirname "$pathname" )" ).txt"
done
# Sample metadata file (two columns: sample.id, group.id):
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/atac.metadata.txt
# sample.id group.id
# GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1 control.gaba
# GABA-288_TAGGCATG-TATCCTCT_HGHM2DSXY_L003_001.R1 control.gaba
# GABA-372_GGACTCCT-TCGACTAG_HGHM2DSXY_L003_001.R1 control.gaba
# GABA-405_CGAGGCTG-GAGCCTTA_HGHM2DSXY_L004_001.R1 control.gaba
# GABA-406_CGTACTAG-TCTCTCCG_HGHM2DSXY_L004_001.R1 control.gaba
# GABA-430_AAGAGGCA-CTATTAAG_HGHM2DSXY_L004_001.R1 control.gaba
# ... 179 lines
# Print one awk command per metadata line; each command appends ",<group>" to
# every line of that sample's MACS2 peak file and writes the result to a
# ".macs2.append.csv" file next to it.
filepath = '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/atac.metadata.txt'
PEAKS_DIR = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/macs.output/peaks/"


def build_append_command(name, key):
    """Return the awk command that appends ``,key`` to each line of ``name``'s peak file."""
    source = PEAKS_DIR + name + ".macs2.txt"
    target = PEAKS_DIR + name + ".macs2.append.csv"
    # awk concatenates $0, the string "," and the quoted group label
    awk_program = '{print $0",""' + key + '"}'
    return "awk '" + awk_program + "' " + source + " > " + target


def main(metadata_path=filepath):
    """Print the append command for every metadata line that has a name and a group."""
    with open(metadata_path) as f:
        for line in f:
            fields = line.split()
            # blank or incomplete lines used to raise IndexError; skip them
            if len(fields) < 2:
                continue
            print(build_append_command(fields[0], fields[1]))


if __name__ == "__main__":
    main()
# Write the commands printed by scripts/macs.append.py to a file, then pass
# that file to the submit-3 launcher.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
python scripts/macs.append.py > commands/macs.append.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name append --time 02:00:00 --maxpernode 20 --nodes 2 commands/macs.append.txt
# DiffBind vignette: http://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf
# Make the DiffBind sample sheet; it uses the following column names:
## [1] "SampleID" "Tissue" "Factor" "Condition" "Treatment"
## [6] "Replicate" "bamReads" "ControlID" "bamControl" "Peaks"
## [11] "PeakCaller"
# Build the DiffBind sample sheet (diffbind.sample.csv) from the sample metadata.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
df <- read.delim("atac.metadata.txt", header=F, sep="\t")
library(dplyr)
library(tidyr)
# V1 = "<sample>_<sequencing id...>", V2 = "<condition>.<tissue>"
df2 <- separate(df, V1, c("SampleID", "SampleSeq"), sep="_")
df3 <- separate(df2, V2, c("Condition", "Tissue"))
# Columns are created by name, in the order of the sample-sheet column list.
# The sheet previously carried "Sample.ID" and "Control.ID", which do not match
# the "SampleID" / "ControlID" names in that list.
df.sample <- data.frame(
  SampleID   = df3$SampleID,
  Tissue     = df3$Tissue,
  Factor     = NA,
  Condition  = df3$Condition,
  Treatment  = df3$Condition,
  Replicate  = 1,
  bamReads   = paste0("bwa.output/rmdups.bam/", df$V1, ".rmdups.bam"),
  ControlID  = NA,
  bamControl = NA,
  Peaks      = paste0("bwa.output/macs.output/peaks/", df$V1, ".macs2.bed.gz"),
  PeakCaller = "bed",
  stringsAsFactors = FALSE
)
write.csv(df.sample, file = 'diffbind.sample.csv', row.names=F)
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R: DiffBind on all samples -- load peaksets, count reads, contrast by
# condition, then write plots and the differential report.
library(DiffBind)
library(tidyverse)
# the sample sheet uses paths relative to the project root, so load and count from there
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.sample.csv")
names(samples)
df <- dba(sampleSheet="key.files/diffbind.sample.csv")
head(df$peaks[[1]])
# Chr Start End Score
# 1 NC_000001.11 9983 10505 0.0317054912
# 2 NC_000001.11 16157 16485 0.0044475759
# 3 NC_000001.11 180742 181601 0.0130344797
# 4 NC_000001.11 182702 182905 0.0008366727
# 5 NC_000001.11 191194 191903 0.0464133163
# 6 NC_000001.11 629844 630066 0.0539874059
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output")
pdf("dba.heatmap.pdf")
dba.plotHeatmap(df)
dev.off()
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df.counted <- dba.count(df, summits=250)
df.counted
# all remaining output goes to the macs.output directory
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output")
pdf("dba.counted.heatmap.pdf")
dba.plotHeatmap(df.counted)
dev.off()
pdf("dba.counted.pca.pdf")
dba.plotPCA(df.counted, attributes=DBA_TISSUE, label=DBA_ID)
dev.off()
df.counted <- dba.contrast(df.counted, categories=DBA_CONDITION)
# Run the analysis once with both methods. The separate DESeq2-only and
# edgeR-only runs were each overwritten by the next call, so they only cost time:
#   dba.analyze(df.counted)                    # DESeq2
#   dba.analyze(df.counted, method=DBA_EDGER)  # EdgeR
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=T)
pdf("dba.differential.venn.pdf")
dba.plotVenn(df.analysed,contrast=1,method=DBA_ALL_METHODS)
dev.off()
pdf("dba.differential.heatmap.pdf")
dba.plotHeatmap(df.analysed, contrast=1)
dev.off()
pdf("dba.differential.heatmap2.pdf")
# was `tamoxifen.analysed`, an object that is never created here
dba.plotHeatmap(df.analysed, ColAttributes = DBA_CONDITION, contrast=1, correlations=FALSE)
dev.off()
pdf("dba.plotMA.pdf")
dba.plotMA(df.analysed)
dev.off()
pdf("dba.plotVolcano.pdf")
dba.plotVolcano(df.analysed)
dev.off()
pdf("dba.plotPCA.pdf")
dba.plotPCA(df.analysed, contrast = 1)
dev.off()
pdf("dba.plotBox.pdf")
dba.plotBox(df.analysed)
dev.off()
report <- dba.report(df.analysed)
report
report.df <- as.data.frame(report)
# NOTE: tab-separated despite the .csv extension
write.table(report.df, "peaks_report.csv", sep="\t", quote=F, row.names=F)
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/dba.heatmap.pdf .
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Split the sample sheet by cell type: header line + the rows matching each name.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
head -1 key.files/diffbind.sample.csv > key.files/diffbind.header.csv
grep 'olig' key.files/diffbind.sample.csv | cat key.files/diffbind.header.csv - > key.files/diffbind.sample.olig.csv
grep 'gaba' key.files/diffbind.sample.csv | cat key.files/diffbind.header.csv - > key.files/diffbind.sample.gaba.csv
grep 'glu' key.files/diffbind.sample.csv | cat key.files/diffbind.header.csv - > key.files/diffbind.sample.glu.csv
# R: DiffBind on the olig samples only.
library(DiffBind)
library(tidyverse)
# the sample sheet uses paths relative to the project root, so load and count from there
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.sample.olig.csv")
names(samples)
df <- dba(sampleSheet="key.files/diffbind.sample.olig.csv")
head(df$peaks[[1]])
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output")
pdf("dba.heatmap.olig.pdf")
dba.plotHeatmap(df)
dev.off()
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df.counted <- dba.count(df, summits=250)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output")
pdf("dba.counted.heatmap.olig.pdf")
dba.plotHeatmap(df.counted)
dev.off()
pdf("dba.counted.pca.olig.pdf")
dba.plotPCA(df.counted, attributes=DBA_CONDITION, label=DBA_CONDITION)
dev.off()
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df.counted <- dba.contrast(df.counted, categories=DBA_CONDITION)
#Both methods simultaneously
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=T)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Condition heroin 29 control 30 23393 2
# Keep the edgeR-only run in its own object. Assigning it to df.analysed
# replaced the all-methods result that the Venn plot below asks for.
df.analysed.edger <- dba.analyze(df.counted, method=DBA_EDGER)
dba.show(df.analysed.edger, bContrasts=T)
# Factor Group Samples Group2 Samples2
# 1 Condition heroin 29 control 30
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output")
pdf("dba.differential.venn.olig.pdf")
dba.plotVenn(df.analysed,contrast=1,method=DBA_ALL_METHODS)
dev.off()
pdf("dba.differential.heatmap.olig.pdf")
dba.plotHeatmap(df.analysed, contrast=1)
dev.off()
pdf("merge.dba.differential.heatmap2.olig.pdf")
dba.plotHeatmap(df.analysed, ColAttributes = DBA_CONDITION, contrast=1, correlations=FALSE)
dev.off()
pdf("merge.dba.plotMA.olig.pdf")
dba.plotMA(df.analysed)
dev.off()
pdf("merge.dba.plotVolcano.olig.pdf")
dba.plotVolcano(df.analysed)
dev.off()
pdf("merge.dba.plotPCA.olig.pdf")
dba.plotPCA(df.analysed, contrast = 1)
dev.off()
pdf("merge.dba.plotBox.olig.pdf")
dba.plotBox(df.analysed)
dev.off()
report <- dba.report(df.analysed)
report
report.df <- as.data.frame(report)
# NOTE: tab-separated despite the .csv extension
write.table(report.df, "peaks_report_olig.csv", sep="\t", quote=F, row.names=F)
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/dba.heatmap.olig.pdf .
# salloc -A SYB105 -p gpu -N 1 -t 1:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R: build a DESeq2 dataset from the bin matrix and the sample/condition key.
library(dplyr)
library(tidyr)
# counts matrix file
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/bins")
#cts.in <- read.table("bin.matrix.txt", header=T, sep="\t")
# NOTE(review): this file holds CPM values, while DESeq2 expects raw integer
# counts; the commented-out bin.matrix.txt line above is probably the input to use.
cts.in <- read.table("bin.cpm.matrix.id.gr1.header.txt", header=T, sep="\t")
# collapse chr/start/end into one bin id (the result was assigned to `cst.id`,
# a typo, so the next line referenced an object that did not exist)
cts.id <- unite(cts.in, id, c(chr, start, end), sep="_", remove=TRUE)
# numeric sample columns only, with the bin id as row names
# (as.matrix() has no row.names="id" option and kept the text columns)
cts <- as.matrix(cts.id[, colnames(cts.id) != "id"])
rownames(cts) <- cts.id$id
# sample / condition file
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
sample <- read.delim("atac.csaw.key1.rmdups.txt", header=T, sep="\t")
# split the table that was just read (`sample`), not a `sample.id` left over from earlier
sample.id <- separate(sample, sample.name, c("sample", "seq"), sep="_")
coldata <- sample.id[,c(1,3)]
rownames(coldata) <- coldata$sample
coldata$condition <- factor(coldata$condition)
# every sample in the key must be a column of the matrix; fail here with a clear
# message rather than in the subsetting below
stopifnot(all(rownames(coldata) %in% colnames(cts)))
cts <- cts[, rownames(coldata)]
stopifnot(all(rownames(coldata) == colnames(cts)))
library("DESeq2")
dds <- DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ condition)
dds
# a DESeqDataSet is not a table, so write.table() cannot store it; save the object itself
saveRDS(dds, "deseq.dds.rds")
## submit as job on summit
# Shared path prefixes for the commands below.
PERSONAL=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm
PROJECT=$PERSONAL/projects/opioid.atac
COMMANDS=$PROJECT/commands
SUBMIT_SUMMIT=/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3
SUBMIT_ANDES=/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit
source "$PERSONAL/scripts/loadcondasummit.sh"
conda activate "$PERSONAL/Libraries/summit/anaconda3"
cd "$PROJECT/bwa.output/rmdups.bam"
#/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bam.merge --time 02:00:00 --maxpernode 1 --nodes 2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/commands/bam.merge.commands.txt
# awk 'NR >= 1 && NR <= 3'
"$SUBMIT_SUMMIT" --name bam.merge --time 02:00:00 --maxpernode 1 --nodes 3 "$COMMANDS/bam.merge.commands1.txt"
"$SUBMIT_SUMMIT" --name bam.merge --time 02:00:00 --maxpernode 1 --nodes 2 "$COMMANDS/bam.merge.commands2.txt"
"$SUBMIT_SUMMIT" --name bam.merge.single --time 02:00:00 --maxpernode 1 --nodes 2 "$COMMANDS/bam.merge.commands.single.txt"
"$SUBMIT_SUMMIT" --name bam.merge.index --time 02:00:00 --maxpernode 1 --nodes 2 "$COMMANDS/bam.merge.index.commands.txt"
"$SUBMIT_SUMMIT" --name bam.merge.bed --time 02:00:00 --maxpernode 20 --nodes 2 "$COMMANDS/bam.merge.bed.commands.txt"
cd "$PROJECT/bwa.output/rmdups.bam"
"$SUBMIT_ANDES" --name merge.macs2 --time 48:00:00 --maxpernode 4 --nodes 2 "$COMMANDS/merge.macs2.commands.txt"
"$SUBMIT_ANDES" --name bamtobigwig --time 48:00:00 --maxpernode 4 --nodes 2 "$COMMANDS/merge.bamtobigwig.commands.txt"
#### Running out of time on index generation in summit...
# Andes
source "$PERSONAL/scripts/loadcondaandes.sh"
conda activate "$PERSONAL/Libraries/andes/envs/test"
#conda install -c bioconda hmmratac
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/samtools/samtools
cd "$PROJECT/bwa.output/rmdups.bam"
"$SUBMIT_ANDES" --name bam.merge --time 24:00:00 --maxpernode 2 --nodes 6 "$COMMANDS/bam.merge.andes.commands.txt"
cd "$PROJECT/bwa.output/merge/bam"
"$SUBMIT_ANDES" --name bam.merge.index --time 24:00:00 --maxpernode 2 --nodes 2 "$COMMANDS/bam.merge.index.andes.commands.txt"
cd "$PROJECT/"
"$SUBMIT_ANDES" --name bamtobigwig --time 48:00:00 --maxpernode 4 --nodes 2 "$COMMANDS/merge.bamtobigwig.commands.txt"
# salloc -A SYB105 -p gpu -N 1 -t 1:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
library(dplyr)
library(tidyr)
#set the working directory from which the files will be read from
peak.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/macs.output/merge/csv"
setwd(peak.dir)
#create a list of the files from your target directory
file_list <- list.files(path=peak.dir)
stopifnot(length(file_list) > 0)
# Read each file exactly once and stack the rows. The earlier
# if (!exists("dataset")) / if (exists("dataset")) loop appended the first file
# twice (both conditions hold on the first iteration) and kept appending to any
# `dataset` already present in the session.
dataset <- do.call(rbind, lapply(file_list, function(file) {
  read.table(file, header=FALSE, sep="\t", stringsAsFactors=FALSE)
}))
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref")
chr <- read.delim("chr.txt", header=F, sep="\t")
# keep only the sequences listed in chr.txt
dataset.chr <- subset(dataset, dataset$V1 %in% chr$V1)
# characters 8-9 of the accession, e.g. "NC_000001.11" -> "01"
dataset.chr$chrom <- substr(dataset.chr$V1, 8, 9)
dataset1 <- dataset.chr[,c(11,2,3,10)]
colnames(dataset1) <- c("chrom", "start", "end", "label")
# column 10 is "<value>,<key>"; split it into its two parts
dataset2 <- separate(dataset1, "label", c("val", "key"), sep=",")
df <- as.data.frame(dataset2[,c(1:3,5)])
df$chr <- as.numeric(df$chrom)
df.num <- df[,c(5,2,3)]
key <- dataset2$key
library(devtools)
library(ggbiplot)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df.pca <- prcomp(df.num, scale. = TRUE)
pdf("macs.merge.pca.pdf")
ggbiplot(df.pca, obs.scale = 1, var.scale = 1,
         groups = key, ellipse = TRUE, circle = TRUE) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')
dev.off()
library(ggfortify)
# same decomposition as df.pca; reuse it rather than recomputing
pca_res <- df.pca
pdf("macs.merge.pca2.pdf")
autoplot(pca_res, data = dataset2, colour = 'key')
dev.off()
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/macs.merge.pca.pdf ~/Downloads/.
### merged peaks
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/macs.output/merge/* .
# Load the six merged narrowPeak files and the gene annotation, and name their columns.
# Plain spaces and parentheses: "\ " and "\(" are not valid escapes in an R
# string, so the previous path could not even be parsed.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")
# Read one headerless tab-separated peak file; the fourth column is named
# "<label>.peak" so that each table keeps a distinct peak-id column.
read.peaks <- function(file, label) {
  peaks <- read.delim(file, header=F, sep="\t")
  colnames(peaks) <- c("chr", "start", "end", paste0(label, ".peak"), "val", "strand", "sig", "pval", "qval", "val2")
  peaks
}
control.olig <- read.peaks("OLIG-control-merge.narrowPeak", "control.olig")
control.gaba <- read.peaks("GABA-control-merge.narrowPeak", "control.gaba")
control.glu <- read.peaks("GLU-control-merge.narrowPeak", "control.glu")
heroin.olig <- read.peaks("OLIG-heroin-merge.narrowPeak", "heroin.olig")
heroin.gaba <- read.peaks("GABA-heroin-merge.narrowPeak", "heroin.gaba")
heroin.glu <- read.peaks("GLU-heroin-merge.narrowPeak", "heroin.glu")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header=F, sep="\t")
colnames(gene) <- c("chr", "source", "type", "start", "end", "dot1", "strand", "dot2", "id")
# Number of distinct peak ids in each merged peak set (recorded result below each call).
sum(!duplicated(control.olig$control.olig.peak))
# 753438
sum(!duplicated(control.gaba$control.gaba.peak))
# 601649
sum(!duplicated(control.glu$control.glu.peak))
# 506564
sum(!duplicated(heroin.olig$heroin.olig.peak))
# 659813
sum(!duplicated(heroin.gaba$heroin.gaba.peak))
# 610903
sum(!duplicated(heroin.glu$heroin.glu.peak))
# 489561
library(tidygenomics)
library(dplyr)
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("IRanges")
# peaks overlapping genes
# Uses every peak in each set; the same *.gene objects are recomputed from the
# sig > 3 subsets later in this notebook.
coord.cols <- c("chr", "start", "end")
control.olig.gene <- genome_intersect(control.olig, gene, by=coord.cols)
control.gaba.gene <- genome_intersect(control.gaba, gene, by=coord.cols)
control.glu.gene <- genome_intersect(control.glu, gene, by=coord.cols)
heroin.olig.gene <- genome_intersect(heroin.olig, gene, by=coord.cols)
heroin.gaba.gene <- genome_intersect(heroin.gaba, gene, by=coord.cols)
heroin.glu.gene <- genome_intersect(heroin.glu, gene, by=coord.cols)
# peaks in proximal regions (<2kb from gene)
# Build a 2 kb window on each side of every gene: gene.up spans [start - 2000, start]
# and gene.down spans [end, end + 2000]. Strand is not consulted, so "up"/"down"
# refer to coordinate order only. Both windows include the gene boundary base itself.
gene.up <- gene
gene.up$end <- gene.up$start
gene.up$start <- gene.up$start - 2000
# NOTE(review): start - 2000 goes negative for genes within 2 kb of a sequence start -- confirm this is harmless for genome_intersect.
gene.down <- gene
gene.down$start <- gene.down$end
gene.down$end <- gene.down$end + 2000
gene.proximal <- rbind(gene.up, gene.down)
# Recorded results below each call: <count> / <unique peaks in that set> = fraction.
# (The numerator is presumably nrow() of the intersect -- the call producing it is not shown.)
control.olig.prox <- genome_intersect(control.olig, gene.proximal, by=c("chr", "start", "end"))
# 36239 / 753438 = 0.04809818
control.gaba.prox <- genome_intersect(control.gaba, gene.proximal, by=c("chr", "start", "end"))
# 28854 / 601649 = 0.04795819
control.glu.prox <- genome_intersect(control.glu, gene.proximal, by=c("chr", "start", "end"))
# 25010 / 506564 = 0.04937185
heroin.olig.prox <- genome_intersect(heroin.olig, gene.proximal, by=c("chr", "start", "end"))
# 32128 / 659813 = 0.04869258
heroin.gaba.prox <- genome_intersect(heroin.gaba, gene.proximal, by=c("chr", "start", "end"))
# 29003 / 610903 = 0.04747562
heroin.glu.prox <- genome_intersect(heroin.glu, gene.proximal, by=c("chr", "start", "end"))
# 24615 / 489561 = 0.05027974
# only use "significant" peaks...
# Keep rows whose "sig" column exceeds the cutoff, then redo the gene and
# proximal-window overlaps on those subsets (overwriting the all-peak versions).
sig.cutoff <- 3
coord.cols <- c("chr", "start", "end")
control.olig.sig <- subset(control.olig, sig > sig.cutoff)
control.gaba.sig <- subset(control.gaba, sig > sig.cutoff)
control.glu.sig <- subset(control.glu, sig > sig.cutoff)
heroin.olig.sig <- subset(heroin.olig, sig > sig.cutoff)
heroin.gaba.sig <- subset(heroin.gaba, sig > sig.cutoff)
heroin.glu.sig <- subset(heroin.glu, sig > sig.cutoff)
# 69326, 45246, 56868, 56011, 39271, 47274
control.olig.gene <- genome_intersect(control.olig.sig, gene, by=coord.cols)
control.gaba.gene <- genome_intersect(control.gaba.sig, gene, by=coord.cols)
control.glu.gene <- genome_intersect(control.glu.sig, gene, by=coord.cols)
heroin.olig.gene <- genome_intersect(heroin.olig.sig, gene, by=coord.cols)
heroin.gaba.gene <- genome_intersect(heroin.gaba.sig, gene, by=coord.cols)
heroin.glu.gene <- genome_intersect(heroin.glu.sig, gene, by=coord.cols)
control.olig.prox <- genome_intersect(control.olig.sig, gene.proximal, by=coord.cols)
control.gaba.prox <- genome_intersect(control.gaba.sig, gene.proximal, by=coord.cols)
control.glu.prox <- genome_intersect(control.glu.sig, gene.proximal, by=coord.cols)
heroin.olig.prox <- genome_intersect(heroin.olig.sig, gene.proximal, by=coord.cols)
heroin.gaba.prox <- genome_intersect(heroin.gaba.sig, gene.proximal, by=coord.cols)
heroin.glu.prox <- genome_intersect(heroin.glu.sig, gene.proximal, by=coord.cols)
# 8019, 5413, 6200, 7229, 5448, 5817
# peaks genic (not proximal... aka: not overlapping the edge of the gene)
# anti_join keeps the gene-overlapping rows whose peak name never appears in the
# proximal set, i.e. peaks that do not reach into either 2 kb flank window.
control.olig.gene.noprox <- anti_join(control.olig.gene, control.olig.prox, by="control.olig.peak")
control.gaba.gene.noprox <- anti_join(control.gaba.gene, control.gaba.prox, by="control.gaba.peak")
control.glu.gene.noprox <- anti_join(control.glu.gene, control.glu.prox, by="control.glu.peak")
heroin.olig.gene.noprox <- anti_join(heroin.olig.gene, heroin.olig.prox, by="heroin.olig.peak")
heroin.gaba.gene.noprox <- anti_join(heroin.gaba.gene, heroin.gaba.prox, by="heroin.gaba.peak")
heroin.glu.gene.noprox <- anti_join(heroin.glu.gene, heroin.glu.prox, by="heroin.glu.peak")
# Recorded counts, in the order of the six assignments above:
# 35282, 21988, 29875, 28752, 19176, 24779
## genic regions that are exonic...
# R string literals reject shell-style escapes such as "\ " and "\(" (parse error),
# so the path is spelled with its literal space and parentheses.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")
gene.exon <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.exon.gtf", header=FALSE, sep="\t")
colnames(gene.exon) <- c("chr", "source", "type", "start", "end", "dot1", "strand", "dot2", "id")
# Intersect the genic, non-proximal peaks with the exon features.
control.olig.exon <- genome_intersect(control.olig.gene.noprox, gene.exon, by=c("chr", "start", "end"))
control.gaba.exon <- genome_intersect(control.gaba.gene.noprox, gene.exon, by=c("chr", "start", "end"))
control.glu.exon <- genome_intersect(control.glu.gene.noprox, gene.exon, by=c("chr", "start", "end"))
heroin.olig.exon <- genome_intersect(heroin.olig.gene.noprox, gene.exon, by=c("chr", "start", "end"))
heroin.gaba.exon <- genome_intersect(heroin.gaba.gene.noprox, gene.exon, by=c("chr", "start", "end"))
heroin.glu.exon <- genome_intersect(heroin.glu.gene.noprox, gene.exon, by=c("chr", "start", "end"))
# 5586, 3539, 4987, 5196, 3414, 4314 --> only 16-18% of the "genic" peaks are in exons
# plot distribution of peaks relative to gene annotations
library(ggplot2)
library(reshape2)
library(RColorBrewer)
# Peak counts per sample set (sig > 3), copied from the results recorded above:
#   genic    = gene-overlapping peaks that do not touch a 2 kb flank window
#   proximal = peaks overlapping a 2 kb flank window
#   total    = all sig > 3 peaks;  distal = total - proximal - genic
# heroin.glu is computed from its own total (47274); 39271 is the heroin.gaba total.
annotation.counts <- function(genic, proximal, total) {
  c(genic, proximal, total - proximal - genic, total)
}
df <- data.frame(control.olig = annotation.counts(35282, 8019, 69326),
                 control.gaba = annotation.counts(21988, 5413, 45246),
                 control.glu = annotation.counts(29875, 6200, 56868),
                 heroin.olig = annotation.counts(28752, 7229, 56011),
                 heroin.gaba = annotation.counts(19176, 5448, 39271),
                 heroin.glu = annotation.counts(24779, 5817, 47274))
# Plot 1: stacked absolute counts (the "total" row is left out of the stack).
df$annotation <- c("genic", "proximal (<2kb)", "distal (>2kb)", "total")
df.melt <- melt(df[1:3,], id="annotation")
ggplot(df.melt, aes(x=variable, y=value, fill=annotation)) + geom_bar(stat="identity") + theme_classic() + scale_fill_brewer(palette="Dark2") + xlab("") + ylab("Number of Peaks with Significance value > 3")
# Plot 2: the same categories as a proportion of each set's total.
# Short labels are used here so they are valid column names after dcast().
df$annotation <- c("genic", "proximal", "distal", "total")
df.melt <- melt(df, id="annotation")
df.dcast <- dcast(df.melt, variable ~ annotation)
df.prop <- df.dcast %>% mutate(Annotation = variable, Genic = genic/total, Proximal = proximal/total, Distal = distal/total)
df.melt <- melt(df.prop[,c("Annotation", "Genic", "Proximal", "Distal")], id="Annotation")
ggplot(df.melt, aes(x=Annotation, y=value, fill=variable)) + geom_bar(stat="identity") + theme_classic() + scale_fill_brewer(palette="Dark2") + xlab("") + ylab("Proportion of Peaks with Significance value > 3")
# peaks unique to control or heroin samples
# For each cell type: intersect the control and heroin sig > 3 sets, then keep the
# peaks of each condition whose name is absent from that intersection.
coord.cols <- c("chr", "start", "end")
olig.intersect <- genome_intersect(control.olig.sig, heroin.olig.sig, by=coord.cols)
control.olig.uniq <- control.olig.sig %>% anti_join(olig.intersect, by="control.olig.peak")
# 32290
heroin.olig.uniq <- heroin.olig.sig %>% anti_join(olig.intersect, by="heroin.olig.peak")
# 19042
gaba.intersect <- genome_intersect(control.gaba.sig, heroin.gaba.sig, by=coord.cols)
control.gaba.uniq <- control.gaba.sig %>% anti_join(gaba.intersect, by="control.gaba.peak")
# 15262
heroin.gaba.uniq <- heroin.gaba.sig %>% anti_join(gaba.intersect, by="heroin.gaba.peak")
# 9246
glu.intersect <- genome_intersect(control.glu.sig, heroin.glu.sig, by=coord.cols)
control.glu.uniq <- control.glu.sig %>% anti_join(glu.intersect, by="control.glu.peak")
# 16416
heroin.glu.uniq <- heroin.glu.sig %>% anti_join(glu.intersect, by="heroin.glu.peak")
# 6806
# Write each condition-unique set as a headerless, tab-separated file named
# <condition>.<tissue>.uniq.peak.bed in the working directory.
uniq.sets <- list(control.olig = control.olig.uniq, heroin.olig = heroin.olig.uniq,
                  control.gaba = control.gaba.uniq, heroin.gaba = heroin.gaba.uniq,
                  control.glu = control.glu.uniq, heroin.glu = heroin.glu.uniq)
for (set.name in names(uniq.sets)) {
  write.table(uniq.sets[[set.name]], paste0(set.name, ".uniq.peak.bed"),
              quote=FALSE, row.names=FALSE, col.names=FALSE, sep="\t")
}
# Plot of number of sig>3 peaks that are unique (differential) versus shared
# peaks = sig > 3 totals; differential.peaks = condition-unique counts recorded above.
# Both vectors are ordered control/heroin for olig, then gaba, then glu (see condition/tissue below).
df <- data.frame(peaks = c(69326, 56011, 45246, 39271, 56868, 47274),
differential.peaks = c(32290, 19042, 15262, 9246, 16416, 6806))
df$condition <- c("control", "heroin", "control", "heroin", "control", "heroin")
df$tissue <- c("olig", "olig", "gaba", "gaba", "glu", "glu")
df.melt <- melt(df, id=c("condition", "tissue"))
# Total vs. differential peak counts, one panel per tissue.
ggplot(df.melt, aes(x=condition, y=value, fill=variable)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ tissue) + theme_classic() + scale_fill_brewer(palette="Paired") + xlab("") + ylab("Number of Peaks with Significance value > 3")
# Fraction of each set's peaks that are unique to that condition.
df$prop <- df$differential.peaks / df$peaks
# Columns 3:5 are condition, tissue, prop.
df.prop <- df[,3:5]
df.prop.melt <- melt(df.prop, id=c("condition", "tissue"))
ggplot(df.prop.melt, aes(x=tissue, y=value, fill=condition)) + geom_bar(stat="identity", position="dodge") + theme_classic() + scale_fill_brewer(palette="Paired") + xlab("") + ylab("Proportion of Peaks (sig > 3) that are unique to that sample")
# df.melt was built before df$prop was added, so it only holds the two count variables.
df.count <- subset(df.melt, df.melt$variable == "peaks")
ggplot(df.count, aes(x=condition, y=value, fill=tissue)) + geom_bar(stat="identity", position="dodge") + theme_classic() + scale_fill_brewer(palette="Set1") + xlab("") + ylab("Number of Peaks with Significance value > 3")
# histogram of distance to gene
# Join every sig > 3 peak to its closest gene; the distance is stored in column "distance".
control.olig.dist <- genome_join_closest(control.olig.sig, gene, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
control.gaba.dist <- genome_join_closest(control.gaba.sig, gene, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
control.glu.dist <- genome_join_closest(control.glu.sig, gene, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
heroin.olig.dist <- genome_join_closest(heroin.olig.sig, gene, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
heroin.gaba.dist <- genome_join_closest(heroin.gaba.sig, gene, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
heroin.glu.dist <- genome_join_closest(heroin.glu.sig, gene, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
# Only control.olig is summarised below.
# NOTE(review): columns are picked by position -- presumably 4 = peak name, 19 = gene id, 20 = distance; confirm against names(control.olig.dist).
control.olig.dist.uniq <- unique(control.olig.dist[,c(4,19,20)])
hist(control.olig.dist.uniq$distance)
# Distal = more than 2 kb from the closest gene.
control.olig.dist.distal <- subset(control.olig.dist.uniq, abs(control.olig.dist.uniq$distance) > 2000)
hist(control.olig.dist.distal$distance)
nrow(control.olig.dist.distal)
# 26024
# NOTE(review): this filter does not take abs() although the distal subset above does; only the < 5000 call is kept here,
# the other bins below were presumably obtained by editing the bounds.
nrow(subset(control.olig.dist.distal, control.olig.dist.distal$distance < 5000))
# 1796 2-5kb
# 2744 5-10kb
# 2175 10-15kb
# 1739 15-20kb
# 17570 >20kb
# scripts/hmmratac.py -- print one HMMRATAC command line per sample in the key file.
# Each key-file line holds three whitespace-separated fields: directory, R1 name, R2 name.
filepath = '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files/atac.key.txt'


def hmmratac_command(r1):
    """Return the HMMRATAC command line for the sample whose R1 name is ``r1``.

    The BAM, BAM index and output prefix are all derived from ``r1``; every
    other argument is fixed.
    """
    bam = "bwa.output/rmdups.bam/" + r1 + ".rmdups.bam"
    return " ".join([
        "java -Xms512m -Xmx16g -jar",
        "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar",
        "-b", bam,
        "-i", bam + ".bai",
        "-g", "ref/GRCh38.p13.chr.size",
        "-o", "bwa.output/hmmratac/bedgraph/" + r1,
        "-e", "ref/hg38-blacklist.v2.ensembl.bed",
        "--bedgraph", "TRUE",
    ])


def read_r1_names(lines):
    """Yield the R1 name (second field) of every non-blank key-file line.

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    for number, line in enumerate(lines, start=1):
        fields = line.split()
        if not fields:
            # A blank (e.g. trailing) line is not a sample; skip it instead of failing.
            continue
        if len(fields) < 3:
            raise ValueError(
                "key file line %d: expected 3 fields (dir, R1, R2), got %d"
                % (number, len(fields))
            )
        yield fields[1]


def main():
    """Print the HMMRATAC command for each sample listed in ``filepath``."""
    with open(filepath) as f:
        for r1 in read_r1_names(f):
            print(hmmratac_command(r1))


if __name__ == "__main__":
    main()
# Andes
# Load conda and activate the project environment.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#conda install -c bioconda hmmratac
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
# Write one HMMRATAC command per sample to a command list, then hand the list to the submit wrapper.
python scripts/hmmratac.py > commands/hmmratac.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name hmmratac --time 48:00:00 --maxpernode 10 --nodes 10 commands/hmmratac.commands.txt
# filter peaks: awk -v OFS="\t" '$13>=10 {print}' NAME_peaks.gappedPeak > NAME.filteredPeaks.gappedPeak
# (keeps the rows whose 13th column is >= 10)
** Adding --bedgraph produces an additional file with every region classified by state. This is only useful if the model indicates that a state other than the default is best for predicting true peak regions; otherwise it reports far more data and uses more memory/time than necessary. The job was cancelled partway through while running on all samples.
# Re-load the environment, then submit the OLIG-276 parameter sweep: one command file per setting
# (default, adjusted, adjusted low / lowlow / high).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name hmmratac --time 48:00:00 --maxpernode 2 --nodes 1 commands/hmmratac.olig276.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name hmmratac --time 48:00:00 --maxpernode 2 --nodes 1 commands/hmmratac.olig276.adjust.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name hmmratac --time 48:00:00 --maxpernode 2 --nodes 1 commands/hmmratac.olig276.adjust.low.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name hmmratac --time 48:00:00 --maxpernode 2 --nodes 1 commands/hmmratac.olig276.adjust.lowlow.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name hmmratac --time 48:00:00 --maxpernode 2 --nodes 1 commands/hmmratac.olig276.adjust.high.commands.txt
# HMMRATAC command lines for the OLIG-276 sweep (presumably the contents of the command files submitted above).
# The runs differ only in the output prefix (-o) and the -l / -u values; the number under a run is the
# recorded peak count of its *_peaks.gappedPeak output.
# default (no -l / -u given)
java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam -i bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/OLIG-276-default -e ref/hg38-blacklist.v2.ensembl.bed --bedgraph TRUE
# 29271 OLIG-276-default_peaks.gappedPeak
# adjusted (-l 10 -u 30)
java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam -i bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/OLIG-276-adjusted -e ref/hg38-blacklist.v2.ensembl.bed -l 10 -u 30 --bedgraph TRUE
# 29154 OLIG-276-adjusted_peaks.gappedPeak
## why are there fewer peaks called with the adjusted criteria?
# try making less stringent (lower values = greater recall)
# low (-l 1 -u 10)
java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam -i bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/OLIG-276-adjusted-low -e ref/hg38-blacklist.v2.ensembl.bed -l 1 -u 10 --bedgraph TRUE
# 756
# lowlow (-l 5 -u 20; no peak count recorded)
java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam -i bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/OLIG-276-adjusted-lowlow -e ref/hg38-blacklist.v2.ensembl.bed -l 5 -u 20 --bedgraph TRUE
### maybe I should be making the thresholds higher instead of lower??
# high (-l 20 -u 50; no peak count recorded)
java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam -i bwa.output/rmdups.bam/OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.rmdups.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/OLIG-276-adjusted-high -e ref/hg38-blacklist.v2.ensembl.bed -l 20 -u 50 --bedgraph TRUE
24978 GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
35946 GABA-302_TCCTGAGC-TATCCTCT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
29605 GABA-309_ACTGAGCG-TATCCTCT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
33019 GABA-316_ACTGAGCG-CGTCTAAT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
7197 GABA-322_AGGCAGAA-TTATGCGA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
8998 GABA-339_AGGCAGAA-CTATTAAG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
22306 GABA-340_ACTGAGCG-AAGGAGTA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
20397 GABA-345_TAAGGCGA-TTATGCGA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
37167 GABA-357_TAGCGCTC-ACTGCATA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
33216 GABA-368_CGTACTAG-AAGGAGTA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
17625 GABA-370_TAAGGCGA-CCTAGAGT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
32621 GABA-372_GGACTCCT-TCGACTAG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
42167 GABA-377_TGCAGCTA-CTCTCTAT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
13427 GABA-383_CGTACTAG-GCGTAAGA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
24101 GABA-405_CGAGGCTG-GAGCCTTA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
19921 GABA-406_CGTACTAG-TCTCTCCG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
4048 GABA-408_CGAGGCTG-CCTAGAGT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
42863 GABA-430_AAGAGGCA-CTATTAAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
22070 GABA-444_AGGCAGAA-TCGACTAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
24348 GABA-S00-0255_ATGCGCAG-CCTAGAGT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
9555 GABA-S03-0019_AAGAGGCA-GTAAGGAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
6642 GABA-S05-0105_ACTGAGCG-GCGTAAGA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
** 1200 GABA-S07-0156_TAGCGCTC-TTATGCGA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
5624 GABA-S08-0061_ACTGAGCG-TTATGCGA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
34458 GABA-S11-0310_ACTGAGCG-CCTAGAGT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
37889 GABA-S13-0098_AAGAGGCA-CTAAGCCT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
47429 GLU-276_TAGGCATG-GAGCCTTA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
53330 GLU-288_TCCTGAGC-GAGCCTTA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
41842 GLU-302_TCCTGAGC-AAGGCTAT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
56354 GLU-316_ACTGAGCG-CTCTCTAT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
17383 GLU-322_AGGCAGAA-CCTAGAGT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
42969 GLU-339_TAAGGCGA-TCGACTAG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
50501 GLU-340_TCCTGAGC-GCGTAAGA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
36522 GLU-345_TAAGGCGA-AAGGCTAT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
54585 GLU-349_GGAGCTAC-CTAAGCCT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
56699 GLU-357_TAGCGCTC-CTAAGCCT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
56679 GLU-368_GCGTAGTA-AAGGAGTA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
66176 GLU-370_CGTACTAG-AAGGCTAT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
74771 GLU-372_GGACTCCT-CTATTAAG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
38576 GLU-377_TGCAGCTA-GTAAGGAG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
58165 GLU-383_TCCTGAGC-GCGTAAGA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
35126 GLU-405_CGAGGCTG-TTCTAGCT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
25215 GLU-408_CGAGGCTG-AAGGCTAT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
37936 GLU-430_CTCTCTAC-CTATTAAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
64292 GLU-444_AGGCAGAA-GAGCCTTA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
28425 GLU-S03-0019_AAGAGGCA-ACTGCATA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
10777 GLU-S05-0105_ACTGAGCG-TTCTAGCT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
33016 GLU-S05-0252_TACGCTGC-TCGACTAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
** 2597 GLU-S07-0156_TAGCGCTC-GAGCCTTA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
36428 GLU-S08-0061_TGCAGCTA-GCGTAAGA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
47537 GLU-S13-0098_AAGAGGCA-AAGGCTAT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
29267 OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
26539 OLIG-288_TCCTGAGC-TCTCTCCG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
22552 OLIG-302_TCCTGAGC-TTATGCGA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
18276 OLIG-316_ACTGAGCG-TCTCTCCG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
11919 OLIG-322_AGGCAGAA-TTCTAGCT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
17346 OLIG-339_ACTGAGCG-GTAAGGAG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
11944 OLIG-340_GTAGAGGA-TTCTAGCT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
18357 OLIG-345_TAAGGCGA-CTATTAAG_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
23139 OLIG-349_GGAGCTAC-AAGGAGTA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
21539 OLIG-357_TAGCGCTC-AAGGAGTA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
24189 OLIG-368_CGTACTAG-CGTCTAAT_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
21678 OLIG-370_GTAGAGGA-TTATGCGA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed
21447 OLIG-372_ACTGAGCG-ACTGCATA_HGHM2DSXY_L003_001.R1_peaks.gappedPeak.col.bed <-- 350M reads
27738 OLIG-383_TCGACGTC-AAGGAGTA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
17687 OLIG-405_CGAGGCTG-TCGACTAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
32669 OLIG-408_CGAGGCTG-TTATGCGA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
18352 OLIG-413_TAAGGCGA-ACTGCATA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
26204 OLIG-430_CTCTCTAC-TTCTAGCT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
26778 OLIG-444_CGAGGCTG-CTATTAAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
17457 OLIG-S00-0255_ACTGAGCG-TCGACTAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
6736 OLIG-S03-0019_AAGAGGCA-AAGGAGTA_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
8646 OLIG-S05-0105_ACTGAGCG-AAGGCTAT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
6776 OLIG-S05-0252_TACGCTGC-CCTAGAGT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
** 960 OLIG-S07-0156_ATGCGCAG-TCGACTAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
9322 OLIG-S08-0061_TGCAGCTA-TTCTAGCT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
14175 OLIG-S11-0310_TAGCGCTC-AAGGCTAT_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
18987 OLIG-S13-0098_GTAGAGGA-TCGACTAG_HGHM2DSXY_L004_001.R1_peaks.gappedPeak.col.bed
34138 pool-A-257-GABA_TCGACGTC-TTATGCGA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
60925 pool-A-257-GLU_TCGACGTC-AAGGCTAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
18831 pool-A-257-OLIG_TCGACGTC-GAGCCTTA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
16751 pool-A-260-GABA_TAAGGCGA-TCTCTCCG_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
22795 pool-A-260-GLU_TAAGGCGA-CGTCTAAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
16506 pool-A-260-OLIG_ATCTCAGG-GTAAGGAG_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
23127 pool-A-265-GABA_AGGCAGAA-CTAAGCCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
50752 pool-A-265-GLU_CGAGGCTG-CTAAGCCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
14259 pool-A-265-OLIG_ATCTCAGG-CGTCTAAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
48402 pool-A-286-GABA_TAAGGCGA-CTAAGCCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
47920 pool-A-286-GLU_TCCTGAGC-CTAAGCCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
23473 pool-A-286-OLIG_GGACTCCT-CTAAGCCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
24044 pool-A-319-GABA_TCGACGTC-CTATTAAG_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
35512 pool-A-319-GLU_CGATCAGT-GAGCCTTA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
* 19192 pool-A-319-OLIG_CGATCAGT-TTATGCGA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
28863 pool-A-332-GABA_TAGGCATG-TTCTAGCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
32400 pool-A-332-GLU_CCTAAGAC-GAGCCTTA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
14984 pool-A-332-OLIG_GCGTAGTA-CGTCTAAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
16381 pool-A-338-GABA_TAGCGCTC-TATCCTCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
30617 pool-A-338-GLU_GGAGCTAC-TCTCTCCG_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
15663 pool-A-338-OLIG_TAGCGCTC-ACTGCATA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
24854 pool-A-365-GABA_CGGAGCCT-CTAAGCCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
24184 pool-A-365-GLU_CGGAGCCT-CTCTCTAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
* 438 pool-A-365-OLIG_CGGAGCCT-TCTCTCCG_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
22177 pool-A-376-GABA_CTCTCTAC-CCTAGAGT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
30239 pool-A-376-GLU_CTCTCTAC-TTATGCGA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
43756 pool-A-376-OLIG_CGGAGCCT-TATCCTCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
19523 pool-A-387-GABA_TACGCTGC-CGTCTAAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
34279 pool-A-387-GLU_CGGAGCCT-CGTCTAAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
* 3624 pool-A-387-OLIG_CGGAGCCT-AAGGAGTA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
15469 pool-A-394-GABA_CTCTCTAC-GAGCCTTA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
8495 pool-A-394-GLU_CTCTCTAC-TCGACTAG_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
25062 pool-A-394-OLIG_GCTCATGA-TTCTAGCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
14885 pool-A-395-GABA_TCGACGTC-ACTGCATA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
36898 pool-A-395-GLU_TCGACGTC-CGTCTAAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
21470 pool-A-395-OLIG_TCGACGTC-CTCTCTAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
27998 pool-A-398-GABA_CCTAAGAC-TTATGCGA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
* 384 pool-A-398-GLU_GTAGAGGA-GCGTAAGA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
18506 pool-A-398-OLIG_GTAGAGGA-CTATTAAG_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
33139 pool-A-407-GABA_CCTAAGAC-CTCTCTAT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
46412 pool-A-407-GLU_TACGCTGC-AAGGAGTA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
* 10989 pool-A-407-OLIG_ATGCGCAG-AAGGAGTA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
30722 pool-A-427-GABA_ATCTCAGG-TCTCTCCG_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
40735 pool-A-427-GLU_ATCTCAGG-ACTGCATA_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
30083 pool-A-427-OLIG_ATCTCAGG-TATCCTCT_H3W5FDSXY_L001_001.R1_peaks.gappedPeak.col.bed
11139 pool-B-255-GABA_CGATCAGT-AAGGCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
16386 pool-B-255-GLU_TGCAGCTA-TTATGCGA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
8279 pool-B-255-OLIG_CGATCAGT-CTATTAAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
15706 pool-B-310-GABA_TACGCTGC-TATCCTCT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
13572 pool-B-310-GLU_TACGCTGC-ACTGCATA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
27692 pool-B-310-OLIG_TACGCTGC-GTAAGGAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
21642 pool-B-325-GABA_GTAGAGGA-TTCTAGCT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
24397 pool-B-325-GLU_ATGCGCAG-ACTGCATA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
16621 pool-B-325-OLIG_GTAGAGGA-AAGGCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
24539 pool-B-333-GABA_CGATCAGT-GTAAGGAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
34420 pool-B-333-GLU_CGATCAGT-TCTCTCCG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
25149 pool-B-333-OLIG_CGATCAGT-CTCTCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
24766 pool-B-336-GABA_AAGAGGCA-TCGACTAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
26838 pool-B-336-GLU_AAGAGGCA-GAGCCTTA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
27293 pool-B-336-OLIG_TGCAGCTA-CGTCTAAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
35839 pool-B-343-GABA_CCTAAGAC-AAGGCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
42922 pool-B-343-GLU_CCTAAGAC-CTATTAAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
29780 pool-B-343-OLIG_CCTAAGAC-AAGGAGTA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
10193 pool-B-354-GABA_CGTACTAG-TTATGCGA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
17651 pool-B-354-GLU_CGTACTAG-CTATTAAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
* 505 pool-B-354-OLIG_TAGGCATG-GCGTAAGA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
** 33262 pool-B-375-GABA_ACTCGCTA-CTAAGCCT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
** 30870 pool-B-375-GLU_TAGCGCTC-GTAAGGAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
** 38584 pool-B-375-OLIG_TAGCGCTC-CTCTCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
10972 pool-B-382-GABA_TAAGGCGA-GTAAGGAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
12597 pool-B-382-GLU_TAAGGCGA-TATCCTCT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
19159 pool-B-382-OLIG_ATCTCAGG-CTAAGCCT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
12367 pool-B-385-GABA_GCTCATGA-CTATTAAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
15460 pool-B-385-GLU_AAGAGGCA-CCTAGAGT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
19344 pool-B-385-OLIG_AAGAGGCA-TTATGCGA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
21807 pool-B-386-GABA_TGCAGCTA-GAGCCTTA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
17917 pool-B-386-GLU_TGCAGCTA-CTATTAAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
31831 pool-B-386-OLIG_TGCAGCTA-AAGGCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
28784 pool-B-388-GABA_TAGGCATG-CTCTCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
* 18264 pool-B-388-GLU_TCCTGAGC-CTCTCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
9396 pool-B-389-GABA_GGAGCTAC-ACTGCATA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
29540 pool-B-389-GLU_GGAGCTAC-CTCTCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
20914 pool-B-389-OLIG_GGAGCTAC-TATCCTCT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
** 37373 pool-B-397-GABA_ACTCGCTA-TCTCTCCG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
** 35959 pool-B-397-GLU_ACTCGCTA-AAGGAGTA_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
** 1894 pool-B-397-OLIG_ACTCGCTA-CGTCTAAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
22809 pool-B-401-GABA_ATGCGCAG-CTCTCTAT_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
32879 pool-B-401-GLU_CGTACTAG-TCGACTAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
* 18302 pool-B-401-OLIG_ATGCGCAG-GTAAGGAG_H3W5FDSXY_L002_001.R1_peaks.gappedPeak.col.bed
KEY:
* <30M reads mapped
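** poor mapping rate (marker inferred: the key as written showed "*" for both entries, while the listing uses both "*" and "**")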
--> some of the low-peak samples (<2000) come from samples with a low mapping rate, but a few do not, and it is not clear why those are so noisy
--> what to do about the middle bunch (5000-12000)??
# remove samples
GABA-S07, GLU-S07, OLIG-S07, pool-A-319-OLIG, pool-A-365-OLIG, pool-A-387-OLIG, pool-A-407-OLIG, pool-A-398-GLU, pool-B-354-OLIG, pool-B-375-GABA, pool-B-375-GLU, pool-B-375-OLIG, pool-B-388-GLU, pool-B-397-GABA, pool-B-397-GLU, pool-B-397-OLIG, pool-B-401-OLIG
## 17 samples --> 10%
# create directory with only gappedPeak files and rename with .bed extension
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac
mkdir -p peaks
cp *_peaks.gappedPeak peaks/.
cd peaks
find . -name "*.gappedPeak" -exec sh -c 'mv "$1" "${1%.gappedPeak}.bed"' _ {} \;
# create directory with only peak files and subset to only include columns 1-4 (chr, start, end, peak)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac
mkdir -p peaks.coord
for i in *_peaks.gappedPeak; do
  # If nothing matches, the glob stays literal; skip it instead of running cut on a missing file.
  [ -e "$i" ] || continue
  # Write straight into peaks.coord/ (same final file names as before) and quote the
  # name so unusual characters cannot split the arguments.
  cut -f 1-4 "$i" > "peaks.coord/$i.col.bed"
done
## Target sample-sheet columns, in the order they are written below:
## [1] "SampleID" "Tissue" "Factor" "Condition" "Treatment"
## [6] "Replicate" "bamReads" "ControlID" "bamControl" "Peaks"
## [11] "PeakCaller"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
df <- read.delim("atac.metadata.txt", header=FALSE, sep="\t")
library(dplyr)
library(tidyr)
# V1 is the full sample/file stem: the text before the first "_" becomes the sample ID,
# the next piece goes to SampleSeq, and any further pieces are dropped (extra="drop"
# gives the same result as the default, without the warning).
df2 <- separate(df, V1, c("SampleID", "SampleSeq"), sep="_", extra="drop")
# V2 holds condition and tissue separated by a non-alphanumeric character.
df3 <- separate(df2, V2, c("Condition", "Tissue"))
# Peak and BAM paths are <dir><full stem from V1><extension>.
df3$PeakName <- df$V1
df3$PeakCaller <- "bed"
df3$PeaksDir <- "bwa.output/hmmratac/peaks.coord/"
df3$PeaksExt <- "_peaks.gappedPeak.col.bed"
df3$BamName <- df$V1
df3$BamDir <- "bwa.output/rmdups.bam/"
df3$BamExt <- ".rmdups.bam"
df4 <- unite(df3, Peaks, c(PeaksDir, PeakName, PeaksExt), sep="")
df5 <- unite(df4, bamReads, c(BamDir, BamName, BamExt), sep="")
# No factor and no control samples in this design; every sample is its own replicate 1.
df5$Factor <- NA
df5$Treatment <- df5$Condition
df5$Replicate <- 1
df5$ControlID <- NA
df5$bamControl <- NA
# Select by name so the sheet matches the header listed at the top (SampleID / ControlID,
# not Sample.ID / Control.ID) and does not depend on column positions.
df.sample <- df5[, c("SampleID", "Tissue", "Factor", "Condition", "Treatment", "Replicate",
                     "bamReads", "ControlID", "bamControl", "Peaks", "PeakCaller")]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
write.csv(df.sample, file = 'diffbind.hmmratac.sample.csv', row.names=FALSE)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
# Keep the CSV header in its own file so it can be prepended to each subset.
head -1 key.files/diffbind.hmmratac.sample.csv > key.files/diffbind.hmmratac.header.csv
# Per-tissue and control-only sample sheets: header + every row containing the pattern.
# grep matches case-sensitively anywhere in the row, not only in the Tissue/Condition column.
grep 'olig' key.files/diffbind.hmmratac.sample.csv | cat key.files/diffbind.hmmratac.header.csv - > key.files/diffbind.hmmratac.sample.olig.csv
grep 'gaba' key.files/diffbind.hmmratac.sample.csv | cat key.files/diffbind.hmmratac.header.csv - > key.files/diffbind.hmmratac.sample.gaba.csv
grep 'glu' key.files/diffbind.hmmratac.sample.csv | cat key.files/diffbind.hmmratac.header.csv - > key.files/diffbind.hmmratac.sample.glu.csv
grep 'control' key.files/diffbind.hmmratac.sample.csv | cat key.files/diffbind.hmmratac.header.csv - > key.files/diffbind.hmmratac.sample.control.csv
# remove samples 406, 309, 377 --> subset.csv
# (the command that produced the *.subset.csv file is not recorded here)
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Environment for the R session below: load the python module, source the
# conda bootstrap script, then activate the conda env at .../andes/envs/test
# (presumably the env that provides R with DiffBind and tidyverse - TODO confirm).
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Control samples: load the subset sample sheet into DiffBind, plot the
# heatmap of the loaded peaksets, then count reads in the peaks.
library(DiffBind)
library(tidyverse)

proj.dir  <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
plot.dir  <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac"
sheet.csv <- "key.files/diffbind.hmmratac.sample.control.subset.csv"

# The sheet path is relative, so work from the project root while loading
setwd(proj.dir)
samples <- read.csv(sheet.csv)
names(samples)
df <- dba(sampleSheet = sheet.csv)
head(df$peaks[[1]])

# Plots go to the hmmratac output directory
setwd(plot.dir)
pdf("dba.hmmratac.control.heatmap.pdf")
dba.plotHeatmap(df)
dev.off()

# Back to the project root for counting
setwd(proj.dir)
df.counted <- dba.count(df, summits = 250)
df.counted
# ID Tissue Condition Treatment Replicate Reads FRiP
# 1 1 gaba control control 1 42725768 0.07
# 2 2 gaba control control 1 43704543 0.09
# 3 3 gaba control control 1 46220300 0.07
# 4 4 gaba control control 1 42498778 0.06
# 5 5 gaba control control 1 33952735 0.15
# 6 6 gaba control control 1 43848567 0.06
# 7 7 gaba control control 1 37167868 0.09
# 8 8 gaba control control 1 34548832 0.05
# 9 9 gaba control control 1 43780400 0.04
# 10 10 gaba control control 1 16734968 0.05
# 11 11 gaba control control 1 11692172 0.08
# 12 12 gaba control control 1 36431902 0.13
# 13 13 gaba control control 1 45130302 0.10
# 14 14 glu control control 1 46493165 0.10
# 15 15 glu control control 1 44525174 0.11
# 16 16 glu control control 1 37405883 0.18
# 17 17 glu control control 1 33724862 0.11
# 18 18 glu control control 1 35094142 0.12
# 19 19 glu control control 1 50605114 0.14
# 20 20 glu control control 1 33465855 0.09
# 21 21 glu control control 1 31493600 0.05
# 22 22 glu control control 1 34340810 0.07
# 23 23 glu control control 1 14101921 0.07
# 24 24 glu control control 1 37350754 0.09
# 25 25 glu control control 1 35822337 0.11
# 26 26 olig control control 1 35419836 0.12
# 27 27 olig control control 1 39985688 0.08
# 28 28 olig control control 1 18410024 0.15
# 29 29 olig control control 1 33169147 0.07
# 30 30 olig control control 1 35730552 0.11
# 31 31 olig control control 1 30796132 0.11
# 32 32 olig control control 1 12721318 0.10
# 33 33 olig control control 1 14676769 0.07
# 34 34 olig control control 1 43746962 0.04
# 35 35 olig control control 1 17095336 0.06
# 36 36 olig control control 1 14587566 0.07
# 37 37 olig control control 1 12936951 0.09
# 38 38 olig control control 1 11352906 0.12
# 39 39 olig control control 1 15112980 0.12
# 40 40 gaba control control 1 23520169 0.15
# 41 41 glu control control 1 27296926 0.18
# 42 42 olig control control 1 21742002 0.11
# 43 43 gaba control control 1 25494066 0.08
# 44 44 glu control control 1 25484620 0.09
# 45 45 olig control control 1 26727988 0.07
# 46 46 gaba control control 1 24408560 0.13
# 47 47 glu control control 1 25647366 0.17
# 48 48 olig control control 1 21531922 0.08
# 49 49 gaba control control 1 36244265 0.17
# 50 50 glu control control 1 25086894 0.16
# 51 51 olig control control 1 22677852 0.16
# 52 52 gaba control control 1 21035405 0.12
# 53 53 glu control control 1 23904608 0.12
# 54 54 olig control control 1 12134577 0.16
# 55 55 gaba control control 1 28541170 0.14
# 56 56 glu control control 1 25171322 0.12
# 57 57 olig control control 1 19943332 0.09
# 58 58 gaba control control 1 27219006 0.08
# 59 59 glu control control 1 41608044 0.08
# 60 60 olig control control 1 24681970 0.07
# 61 61 gaba control control 1 24313938 0.10
# 62 62 glu control control 1 24159262 0.10
# 63 63 olig control control 1 8340420 0.07
# 64 64 gaba control control 1 26060403 0.11
# 65 65 glu control control 1 24962683 0.10
# 66 66 olig control control 1 27769905 0.16
# 67 67 gaba control control 1 22798424 0.09
# 68 68 glu control control 1 23141216 0.15
# 69 69 olig control control 1 6205234 0.08
# 70 70 gaba control control 1 25728756 0.07
# 71 71 glu control control 1 25038620 0.06
# 72 72 olig control control 1 26965074 0.09
# 73 73 gaba control control 1 18636863 0.09
# 74 74 glu control control 1 24710482 0.12
# 75 75 olig control control 1 20045962 0.12
# 76 76 gaba control control 1 24340360 0.13
# 77 77 glu control control 1 12117088 0.15
# 78 78 olig control control 1 21634187 0.10
# 79 79 gaba control control 1 21645282 0.16
# 80 80 glu control control 1 26730796 0.15
# 81 81 olig control control 1 9552678 0.12
# 82 82 gaba control control 1 27332056 0.12
# 83 83 glu control control 1 25835844 0.14
# 84 84 olig control control 1 26725124 0.15
# Control samples: QC plots of the counted peakset, then differential
# analysis between tissues, with plots and a report table.
proj.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
plot.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac"

setwd(plot.dir)
pdf("dba.hmmratac.control.counted.heatmap.pdf")
dba.plotHeatmap(df.counted)
dev.off()
pdf("dba.hmmratac.control.counted.pca.pdf")
dba.plotPCA(df.counted, attributes=DBA_TISSUE, label=DBA_TISSUE)
dev.off()

# One contrast per pair of tissues
df.counted <- dba.contrast(df.counted, categories=DBA_TISSUE)
setwd(proj.dir)
# DBA_ALL_METHODS fits both edgeR and DESeq2 in one call. The separate
# DESeq2-only (default) and method=DBA_EDGER calls that used to precede it
# were overwritten immediately, so they only repeated the fit.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Tissue gaba 28 glu 27 47127 47643
# 2 Tissue gaba 28 olig 29 59345 58671
# 3 Tissue olig 29 glu 27 69392 69227

# All differential plots below are written to the hmmratac output directory.
# NOTE(review): only contrast 1 is requested explicitly; the calls without a
# contrast argument presumably fall back to the same default - confirm.
setwd(plot.dir)
pdf("dba.hmmratac.control.differential.venn.pdf")
dba.plotVenn(df.analysed,contrast=1,method=DBA_ALL_METHODS)
dev.off()
pdf("dba.hmmratac.control.differential.heatmap.pdf")
dba.plotHeatmap(df.analysed, contrast=1)
dev.off()
pdf("dba.hmmratac.control.differential.heatmap2.pdf")
dba.plotHeatmap(df.analysed, ColAttributes = DBA_TISSUE, contrast=1, correlations=FALSE)
dev.off()
pdf("dba.hmmratac.control.plotMA.pdf")
dba.plotMA(df.analysed)
dev.off()
pdf("dba.hmmratac.control.plotVolcano.pdf")
dba.plotVolcano(df.analysed)
dev.off()
pdf("dba.hmmratac.control.plotPCA.pdf")
dba.plotPCA(df.analysed, contrast = 1)
dev.off()
pdf("dba.hmmratac.control.plotBox.pdf")
dba.plotBox(df.analysed)
dev.off()

report <- dba.report(df.analysed)
report
report.df <- as.data.frame(report)
# NOTE(review): the file is tab-separated despite its .csv extension
write.table(report.df, "peaks.report.hmmratac.control.csv", sep="\t", quote=FALSE, row.names=FALSE)
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Environment for the all-samples run: load the python module, source the
# conda bootstrap script, then activate the conda env at .../andes/envs/test
# (presumably the env that provides R with DiffBind and tidyverse - TODO confirm).
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Merge the olig, gaba and glu subset sheets into one sheet for all samples.
# Only the olig sheet keeps its header row; the other two are stripped of
# their first line before concatenation.
for tissue in gaba glu; do
    tail -n +2 "key.files/diffbind.hmmratac.sample.$tissue.subset.csv" > "key.files/diffbind.hmmratac.sample.$tissue.subset.nohead.csv"
done
cat key.files/diffbind.hmmratac.sample.olig.subset.csv \
    key.files/diffbind.hmmratac.sample.gaba.subset.nohead.csv \
    key.files/diffbind.hmmratac.sample.glu.subset.nohead.csv \
    > key.files/diffbind.hmmratac.sample.all.subset.csv
# All samples (olig + gaba + glu): load the merged subset sample sheet into
# DiffBind, plot the heatmap of the loaded peaksets, then count reads in the peaks.
library(DiffBind)
library(tidyverse)

proj.dir  <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
plot.dir  <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac"
sheet.csv <- "key.files/diffbind.hmmratac.sample.all.subset.csv"

# The sheet path is relative, so work from the project root while loading
setwd(proj.dir)
samples <- read.csv(sheet.csv)
names(samples)
df <- dba(sampleSheet = sheet.csv)
head(df$peaks[[1]])

# Plots go to the hmmratac output directory
setwd(plot.dir)
pdf("dba.hmmratac.all.heatmap.pdf")
dba.plotHeatmap(df)
dev.off()

# Back to the project root for counting
setwd(proj.dir)
df.counted <- dba.count(df, summits = 250)
df.counted
# ID Tissue Condition Treatment Replicate Reads FRiP
# 1 1 olig control control 1 35419836 0.12
# 2 2 olig control control 1 39985688 0.08
# 3 3 olig control control 1 18410024 0.15
# 4 4 olig control control 1 33169147 0.08
# 5 5 olig control control 1 35730552 0.11
# 6 6 olig control control 1 30796132 0.11
# 7 7 olig control control 1 12721318 0.10
# 8 8 olig control control 1 14676769 0.07
# 9 9 olig control control 1 43746962 0.04
# 10 10 olig control control 1 17095336 0.06
# 11 11 olig control control 1 14587566 0.08
# 12 12 olig control control 1 12936951 0.09
# 13 13 olig control control 1 11352906 0.13
# 14 14 olig control control 1 15112980 0.13
# 15 15 olig control control 1 21742002 0.11
# 16 16 olig control control 1 26727988 0.07
# 17 17 olig control control 1 21531922 0.08
# 18 18 olig control control 1 22677852 0.16
# 19 19 olig control control 1 12134577 0.16
# 20 20 olig control control 1 19943332 0.09
# 21 21 olig control control 1 24681970 0.07
# 22 22 olig control control 1 8340420 0.07
# 23 23 olig control control 1 27769905 0.16
# 24 24 olig control control 1 6205234 0.08
# 25 25 olig control control 1 26965074 0.09
# 26 26 olig control control 1 20045962 0.12
# 27 27 olig control control 1 21634187 0.10
# 28 28 olig control control 1 9552678 0.12
# 29 29 olig control control 1 26725124 0.15
# 30 30 olig heroin heroin 1 17678860 0.11
# 31 31 olig heroin heroin 1 33878184 0.07
# 32 32 olig heroin heroin 1 34083352 0.05
# 33 33 olig heroin heroin 1 30492794 0.07
# 34 34 olig heroin heroin 1 18033990 0.08
# 35 35 olig heroin heroin 1 23300071 0.09
# 36 36 olig heroin heroin 1 22856649 0.12
# 37 37 olig heroin heroin 1 23347042 0.15
# 38 38 olig heroin heroin 1 17824007 0.12
# 39 39 olig heroin heroin 1 32371754 0.09
# 40 40 olig heroin heroin 1 37173477 0.08
# 41 41 olig heroin heroin 1 32197804 0.10
# 42 42 olig heroin heroin 1 38069042 0.05
# 43 43 olig heroin heroin 1 23155520 0.05
# 44 44 olig heroin heroin 1 24553068 0.11
# 45 45 olig heroin heroin 1 23520314 0.08
# 46 46 olig heroin heroin 1 26375644 0.10
# 47 47 olig heroin heroin 1 22820588 0.13
# 48 48 olig heroin heroin 1 27431814 0.12
# 49 49 olig heroin heroin 1 11751331 0.08
# 50 50 olig heroin heroin 1 15541901 0.15
# 51 51 olig heroin heroin 1 21521670 0.11
# 52 52 olig heroin heroin 1 22589463 0.08
# 53 53 olig heroin heroin 1 26240104 0.11
# 54 54 olig heroin heroin 1 27822724 0.08
# 55 55 olig heroin heroin 1 8818835 0.17
# 56 56 olig heroin heroin 1 10785076 0.15
# 57 57 gaba control control 1 42725768 0.07
# 58 58 gaba control control 1 43704543 0.09
# 59 59 gaba control control 1 46220300 0.07
# 60 60 gaba control control 1 42498778 0.06
# 61 61 gaba control control 1 33952735 0.15
# 62 62 gaba control control 1 43848567 0.07
# 63 63 gaba control control 1 37167868 0.09
# 64 64 gaba control control 1 34548832 0.05
# 65 65 gaba control control 1 43780400 0.04
# 66 66 gaba control control 1 16734968 0.05
# 67 67 gaba control control 1 11692172 0.08
# 68 68 gaba control control 1 36431902 0.13
# 69 69 gaba control control 1 45130302 0.10
# 70 70 gaba control control 1 23520169 0.15
# 71 71 gaba control control 1 25494066 0.08
# 72 72 gaba control control 1 24408560 0.13
# 73 73 gaba control control 1 36244265 0.17
# 74 74 gaba control control 1 21035405 0.12
# 75 75 gaba control control 1 28541170 0.14
# 76 76 gaba control control 1 27219006 0.08
# 77 77 gaba control control 1 24313938 0.11
# 78 78 gaba control control 1 26060403 0.12
# 79 79 gaba control control 1 22798424 0.09
# 80 80 gaba control control 1 25728756 0.07
# 81 81 gaba control control 1 18636863 0.09
# 82 82 gaba control control 1 24340360 0.13
# 83 83 gaba control control 1 21645282 0.16
# 84 84 gaba control control 1 27332056 0.12
# 85 85 gaba heroin heroin 1 38209635 0.10
# 86 86 gaba heroin heroin 1 35455492 0.10
# 87 87 gaba heroin heroin 1 39350404 0.09
# 88 88 gaba heroin heroin 1 43190608 0.04
# 89 89 gaba heroin heroin 1 38504060 0.05
# 90 90 gaba heroin heroin 1 43133304 0.06
# 91 91 gaba heroin heroin 1 31649714 0.08
# 92 92 gaba heroin heroin 1 41832122 0.11
# 93 93 gaba heroin heroin 1 36760146 0.09
# 94 94 gaba heroin heroin 1 17267758 0.13
# 95 95 gaba heroin heroin 1 39684486 0.12
# 96 96 gaba heroin heroin 1 16119282 0.09
# 97 97 gaba heroin heroin 1 13521178 0.07
# 98 98 gaba heroin heroin 1 21057856 0.07
# 99 99 gaba heroin heroin 1 23836164 0.08
# 100 100 gaba heroin heroin 1 23572849 0.10
# 101 101 gaba heroin heroin 1 22521928 0.10
# 102 102 gaba heroin heroin 1 22137926 0.11
# 103 103 gaba heroin heroin 1 29906670 0.13
# 104 104 gaba heroin heroin 1 25420382 0.06
# 105 105 gaba heroin heroin 1 19021498 0.14
# 106 106 gaba heroin heroin 1 21048088 0.08
# 107 107 gaba heroin heroin 1 20019642 0.08
# 108 108 gaba heroin heroin 1 25704207 0.16
# 109 109 gaba heroin heroin 1 22191212 0.06
# 110 110 gaba heroin heroin 1 20833286 0.12
# 111 111 gaba heroin heroin 1 27920540 0.10
# 112 112 glu control control 1 46493165 0.10
# 113 113 glu control control 1 44525174 0.11
# 114 114 glu control control 1 37405883 0.18
# 115 115 glu control control 1 33724862 0.11
# 116 116 glu control control 1 35094142 0.12
# 117 117 glu control control 1 50605114 0.14
# 118 118 glu control control 1 33465855 0.09
# 119 119 glu control control 1 31493600 0.06
# 120 120 glu control control 1 34340810 0.08
# 121 121 glu control control 1 14101921 0.08
# 122 122 glu control control 1 37350754 0.09
# 123 123 glu control control 1 35822337 0.11
# 124 124 glu control control 1 27296926 0.18
# 125 125 glu control control 1 25484620 0.10
# 126 126 glu control control 1 25647366 0.17
# 127 127 glu control control 1 25086894 0.16
# 128 128 glu control control 1 23904608 0.12
# 129 129 glu control control 1 25171322 0.12
# 130 130 glu control control 1 41608044 0.08
# 131 131 glu control control 1 24159262 0.10
# 132 132 glu control control 1 24962683 0.11
# 133 133 glu control control 1 23141216 0.15
# 134 134 glu control control 1 25038620 0.06
# 135 135 glu control control 1 24710482 0.13
# 136 136 glu control control 1 12117088 0.15
# 137 137 glu control control 1 26730796 0.15
# 138 138 glu control control 1 25835844 0.14
# 139 139 glu heroin heroin 1 34920064 0.11
# 140 140 glu heroin heroin 1 37199359 0.12
# 141 141 glu heroin heroin 1 40031386 0.06
# 142 142 glu heroin heroin 1 39698089 0.11
# 143 143 glu heroin heroin 1 43634936 0.10
# 144 144 glu heroin heroin 1 37132804 0.10
# 145 145 glu heroin heroin 1 52881648 0.09
# 146 146 glu heroin heroin 1 48183170 0.12
# 147 147 glu heroin heroin 1 47100392 0.10
# 148 148 glu heroin heroin 1 44330147 0.15
# 149 149 glu heroin heroin 1 32418334 0.12
# 150 150 glu heroin heroin 1 36583276 0.15
# 151 151 glu heroin heroin 1 39251762 0.08
# 152 152 glu heroin heroin 1 27322639 0.08
# 153 153 glu heroin heroin 1 24944148 0.07
# 154 154 glu heroin heroin 1 27341052 0.10
# 155 155 glu heroin heroin 1 25391272 0.11
# 156 156 glu heroin heroin 1 23019618 0.11
# 157 157 glu heroin heroin 1 22966732 0.17
# 158 158 glu heroin heroin 1 22320260 0.10
# 159 159 glu heroin heroin 1 17425160 0.14
# 160 160 glu heroin heroin 1 21695518 0.09
# 161 161 glu heroin heroin 1 24645000 0.08
# 162 162 glu heroin heroin 1 24291834 0.08
# 163 163 glu heroin heroin 1 12018564 0.17
# 164 164 glu heroin heroin 1 23011422 0.12
# 165 165 glu heroin heroin 1 20594156 0.13
# 166 166 glu heroin heroin 1 28405218 0.12
# All samples: heatmap of the counted peakset, then differential analysis
# between tissues, with plots and a report table.
proj.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
plot.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac"

setwd(plot.dir)
pdf("dba.hmmratac.all.counted.heatmap.pdf")
dba.plotHeatmap(df.counted)
dev.off()

### Differential - Tissue
pdf("dba.hmmratac.all-cell.counted.pca.pdf")
dba.plotPCA(df.counted, attributes=DBA_TISSUE, label=DBA_TISSUE)
dev.off()

df.counted <- dba.contrast(df.counted, categories=DBA_TISSUE)
setwd(proj.dir)
# DBA_ALL_METHODS fits both edgeR and DESeq2 in one call. The separate
# DESeq2-only (default) and method=DBA_EDGER calls that used to precede it
# were overwritten immediately, so they only repeated the fit.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)

# All differential plots below are written to the hmmratac output directory.
# NOTE(review): only contrast 1 is requested explicitly; the calls without a
# contrast argument presumably fall back to the same default - confirm.
setwd(plot.dir)
pdf("dba.hmmratac.all-cell.differential.venn.pdf")
dba.plotVenn(df.analysed,contrast=1,method=DBA_ALL_METHODS)
dev.off()
pdf("dba.hmmratac.all-cell.differential.heatmap.pdf")
dba.plotHeatmap(df.analysed, contrast=1)
dev.off()
pdf("dba.hmmratac.all-cell.differential.heatmap2.pdf")
dba.plotHeatmap(df.analysed, ColAttributes = DBA_TISSUE, contrast=1, correlations=FALSE)
dev.off()
pdf("dba.hmmratac.all-cell.plotMA.pdf")
dba.plotMA(df.analysed)
dev.off()
pdf("dba.hmmratac.all-cell.plotVolcano.pdf")
dba.plotVolcano(df.analysed)
dev.off()
pdf("dba.hmmratac.all-cell.plotPCA.pdf")
dba.plotPCA(df.analysed, contrast = 1)
dev.off()
pdf("dba.hmmratac.all-cell.plotBox.pdf")
dba.plotBox(df.analysed)
dev.off()

report <- dba.report(df.analysed)
report
report.df <- as.data.frame(report)
# NOTE(review): the file is tab-separated despite its .csv extension
write.table(report.df, "peaks.report.hmmratac.all-cell.csv", sep="\t", quote=FALSE, row.names=FALSE)
### Differential - Condition
# All samples: heroin vs control, with plots and a report table.
proj.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
plot.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac"

setwd(plot.dir)
pdf("dba.hmmratac.all-condition.counted.pca.pdf")
dba.plotPCA(df.counted, attributes=DBA_CONDITION, label=DBA_CONDITION)
dev.off()

# The recorded contrast table below lists only the Condition contrast after this call
df.counted <- dba.contrast(df.counted, categories=DBA_CONDITION)
setwd(proj.dir)
# DBA_ALL_METHODS fits both edgeR and DESeq2 in one call. The separate
# DESeq2-only (default) and method=DBA_EDGER calls that used to precede it
# were overwritten immediately, so they only repeated the fit.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Condition heroin 82 control 84 8164 4178

# All differential plots below are written to the hmmratac output directory
setwd(plot.dir)
pdf("dba.hmmratac.all-condition.differential.venn.pdf")
dba.plotVenn(df.analysed,contrast=1,method=DBA_ALL_METHODS)
dev.off()
pdf("dba.hmmratac.all-condition.differential.heatmap.pdf")
dba.plotHeatmap(df.analysed, contrast=1)
dev.off()
pdf("dba.hmmratac.all-condition.differential.heatmap2.pdf")
dba.plotHeatmap(df.analysed, ColAttributes = DBA_CONDITION, contrast=1, correlations=FALSE)
dev.off()
pdf("dba.hmmratac.all-condition.plotMA.pdf")
dba.plotMA(df.analysed)
dev.off()
pdf("dba.hmmratac.all-condition.plotVolcano.pdf")
dba.plotVolcano(df.analysed)
dev.off()
pdf("dba.hmmratac.all-condition.plotPCA.pdf")
dba.plotPCA(df.analysed, contrast = 1)
dev.off()
pdf("dba.hmmratac.all-condition.plotBox.pdf")
dba.plotBox(df.analysed)
dev.off()

report <- dba.report(df.analysed)
report
report.df <- as.data.frame(report)
# NOTE(review): the file is tab-separated despite its .csv extension
write.table(report.df, "peaks.report.hmmratac.all-condition.csv", sep="\t", quote=FALSE, row.names=FALSE)
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Environment for the outlier-filtered run: load the python module, source the
# conda bootstrap script, then activate the conda env at .../andes/envs/test
# (presumably the env that provides R with DiffBind and tidyverse - TODO confirm).
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# remove outlier samples based on number of mapped reads (>40M) and percentage of reads mapped (>70%)
# remove samples
### GABA-S07, GLU-S07, OLIG-S07, pool-A-319-OLIG, pool-A-365-OLIG, pool-A-387-OLIG, pool-A-407-OLIG, pool-A-398-GLU, pool-B-354-OLIG, pool-B-375-GABA, pool-B-375-GLU, pool-B-375-OLIG, pool-B-388-GLU, pool-B-397-GABA, pool-B-397-GLU, pool-B-397-OLIG, pool-B-401-OLIG
#### 17 samples --> 10%
# Outlier-filtered sample set: load the outlier2 sample sheet into DiffBind,
# plot the heatmap of the loaded peaksets, then count reads in the peaks.
library(DiffBind)
library(tidyverse)

proj.dir  <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
plot.dir  <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac"
sheet.csv <- "key.files/diffbind.hmmratac.sample.all.outlier2.csv"

# The sheet path is relative, so work from the project root while loading
setwd(proj.dir)
samples <- read.csv(sheet.csv)
names(samples)
df <- dba(sampleSheet = sheet.csv)
head(df$peaks[[1]])

# Plots go to the hmmratac output directory
setwd(plot.dir)
pdf("dba.hmmratac.all.heatmap.outliers.pdf")
dba.plotHeatmap(df)
dev.off()

# Back to the project root for counting
setwd(proj.dir)
df.counted <- dba.count(df, summits = 250)
df.counted
# ID Tissue Condition Treatment Replicate Reads FRiP
# 1 1 olig control control 1 35419836 0.12
# 2 2 olig control control 1 39985688 0.08
# 3 3 olig control control 1 18410024 0.15
# 4 4 olig control control 1 33169147 0.07
# 5 5 olig control control 1 35730552 0.11
# 6 6 olig control control 1 30796132 0.11
# 7 7 olig control control 1 12721318 0.10
# 8 8 olig control control 1 14676769 0.07
# 9 9 olig control control 1 43746962 0.04
# 10 10 olig control control 1 17095336 0.06
# 11 11 olig control control 1 12936951 0.09
# 12 12 olig control control 1 11352906 0.12
# 13 13 olig control control 1 15112980 0.12
# 14 14 olig control control 1 21742002 0.11
# 15 15 olig control control 1 26727988 0.07
# 16 16 olig control control 1 21531922 0.08
# 17 17 olig control control 1 22677852 0.16
# 18 18 olig control control 1 19943332 0.09
# 19 19 olig control control 1 24681970 0.07
# 20 20 olig control control 1 27769905 0.16
# 21 21 olig control control 1 26965074 0.09
# 22 22 olig control control 1 20045962 0.12
# 23 23 olig control control 1 21634187 0.10
# 24 24 olig control control 1 26725124 0.15
# 25 25 olig heroin heroin 1 17678860 0.11
# 26 26 olig heroin heroin 1 33878184 0.07
# 27 27 olig heroin heroin 1 34083352 0.05
# 28 28 olig heroin heroin 1 30492794 0.06
# 29 29 olig heroin heroin 1 18033990 0.08
# 30 30 olig heroin heroin 1 23300071 0.09
# 31 31 olig heroin heroin 1 22856649 0.12
# 32 32 olig heroin heroin 1 23347042 0.15
# 33 33 olig heroin heroin 1 17824007 0.12
# 34 34 olig heroin heroin 1 32371754 0.09
# 35 35 olig heroin heroin 1 37173477 0.08
# 36 36 olig heroin heroin 1 32197804 0.10
# 37 37 olig heroin heroin 1 38069042 0.05
# 38 38 olig heroin heroin 1 23155520 0.05
# 39 39 olig heroin heroin 1 24553068 0.10
# 40 40 olig heroin heroin 1 23520314 0.08
# 41 41 olig heroin heroin 1 26375644 0.09
# 42 42 olig heroin heroin 1 22820588 0.13
# 43 43 olig heroin heroin 1 27431814 0.12
# 44 44 olig heroin heroin 1 21521670 0.11
# 45 45 olig heroin heroin 1 22589463 0.08
# 46 46 olig heroin heroin 1 26240104 0.11
# 47 47 olig heroin heroin 1 27822724 0.07
# 48 48 gaba control control 1 42725768 0.07
# 49 49 gaba control control 1 43704543 0.09
# 50 50 gaba control control 1 46220300 0.07
# 51 51 gaba control control 1 42498778 0.06
# 52 52 gaba control control 1 33952735 0.15
# 53 53 gaba control control 1 43848567 0.06
# 54 54 gaba control control 1 37167868 0.09
# 55 55 gaba control control 1 34548832 0.05
# 56 56 gaba control control 1 43780400 0.04
# 57 57 gaba control control 1 11692172 0.08
# 58 58 gaba control control 1 36431902 0.13
# 59 59 gaba control control 1 45130302 0.10
# 60 60 gaba control control 1 23520169 0.15
# 61 61 gaba control control 1 25494066 0.08
# 62 62 gaba control control 1 24408560 0.13
# 63 63 gaba control control 1 36244265 0.17
# 64 64 gaba control control 1 21035405 0.12
# 65 65 gaba control control 1 28541170 0.14
# 66 66 gaba control control 1 27219006 0.08
# 67 67 gaba control control 1 24313938 0.10
# 68 68 gaba control control 1 26060403 0.11
# 69 69 gaba control control 1 22798424 0.09
# 70 70 gaba control control 1 25728756 0.07
# 71 71 gaba control control 1 18636863 0.09
# 72 72 gaba control control 1 21645282 0.16
# 73 73 gaba control control 1 27332056 0.12
# 74 74 gaba heroin heroin 1 38209635 0.10
# 75 75 gaba heroin heroin 1 35455492 0.10
# 76 76 gaba heroin heroin 1 39350404 0.08
# 77 77 gaba heroin heroin 1 43190608 0.04
# 78 78 gaba heroin heroin 1 38504060 0.05
# 79 79 gaba heroin heroin 1 43133304 0.06
# 80 80 gaba heroin heroin 1 31649714 0.08
# 81 81 gaba heroin heroin 1 41832122 0.11
# 82 82 gaba heroin heroin 1 36760146 0.09
# 83 83 gaba heroin heroin 1 17267758 0.12
# 84 84 gaba heroin heroin 1 39684486 0.12
# 85 85 gaba heroin heroin 1 16119282 0.09
# 86 86 gaba heroin heroin 1 13521178 0.07
# 87 87 gaba heroin heroin 1 21057856 0.07
# 88 88 gaba heroin heroin 1 23836164 0.08
# 89 89 gaba heroin heroin 1 23572849 0.10
# 90 90 gaba heroin heroin 1 22521928 0.10
# 91 91 gaba heroin heroin 1 22137926 0.11
# 92 92 gaba heroin heroin 1 29906670 0.13
# 93 93 gaba heroin heroin 1 25420382 0.06
# 94 94 gaba heroin heroin 1 21048088 0.08
# 95 95 gaba heroin heroin 1 20019642 0.08
# 96 96 gaba heroin heroin 1 25704207 0.16
# 97 97 gaba heroin heroin 1 22191212 0.06
# 98 98 gaba heroin heroin 1 27920540 0.10
# 99 99 glu control control 1 46493165 0.10
# 100 100 glu control control 1 44525174 0.11
# 101 101 glu control control 1 37405883 0.18
# 102 102 glu control control 1 33724862 0.11
# 103 103 glu control control 1 35094142 0.12
# 104 104 glu control control 1 50605114 0.14
# 105 105 glu control control 1 33465855 0.09
# 106 106 glu control control 1 31493600 0.05
# 107 107 glu control control 1 34340810 0.07
# 108 108 glu control control 1 37350754 0.09
# 109 109 glu control control 1 35822337 0.11
# 110 110 glu control control 1 27296926 0.18
# 111 111 glu control control 1 25484620 0.10
# 112 112 glu control control 1 25647366 0.17
# 113 113 glu control control 1 25086894 0.16
# 114 114 glu control control 1 23904608 0.12
# 115 115 glu control control 1 25171322 0.12
# 116 116 glu control control 1 41608044 0.08
# 117 117 glu control control 1 24159262 0.10
# 118 118 glu control control 1 24962683 0.10
# 119 119 glu control control 1 23141216 0.15
# 120 120 glu control control 1 25038620 0.06
# 121 121 glu control control 1 24710482 0.13
# 122 122 glu control control 1 12117088 0.15
# 123 123 glu control control 1 26730796 0.15
# 124 124 glu control control 1 25835844 0.14
# 125 125 glu heroin heroin 1 34920064 0.11
# 126 126 glu heroin heroin 1 37199359 0.12
# 127 127 glu heroin heroin 1 40031386 0.06
# 128 128 glu heroin heroin 1 39698089 0.11
# 129 129 glu heroin heroin 1 43634936 0.10
# 130 130 glu heroin heroin 1 37132804 0.09
# 131 131 glu heroin heroin 1 52881648 0.08
# 132 132 glu heroin heroin 1 48183170 0.11
# 133 133 glu heroin heroin 1 47100392 0.10
# 134 134 glu heroin heroin 1 44330147 0.15
# 135 135 glu heroin heroin 1 32418334 0.12
# 136 136 glu heroin heroin 1 36583276 0.14
# 137 137 glu heroin heroin 1 39251762 0.07
# 138 138 glu heroin heroin 1 27322639 0.08
# 139 139 glu heroin heroin 1 24944148 0.07
# 140 140 glu heroin heroin 1 27341052 0.10
# 141 141 glu heroin heroin 1 25391272 0.11
# 142 142 glu heroin heroin 1 23019618 0.11
# 143 143 glu heroin heroin 1 22966732 0.17
# 144 144 glu heroin heroin 1 22320260 0.09
# 145 145 glu heroin heroin 1 21695518 0.08
# 146 146 glu heroin heroin 1 24645000 0.08
# 147 147 glu heroin heroin 1 24291834 0.08
# 148 148 glu heroin heroin 1 23011422 0.12
# 149 149 glu heroin heroin 1 28405218 0.12
# Outlier-filtered set: heatmaps of the counted peakset and export of the
# per-peak count table. Everything here is written to the hmmratac directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.all.counted.heatmap.outliers.pdf")
dba.plotHeatmap(df.counted)
dev.off()
###### GABA-309 still an outlier??? remove from file
## adjust colors
pdf("dba.hmmratac.all.counted.heatmap.outliers2.pdf")
# The attribute constant is DBA_TISSUE. R is case-sensitive, so the previous
# spelling DBA_Tissue was an undefined object: the call failed and left an
# empty PDF behind.
dba.plotHeatmap(df.counted, ColAttributes = DBA_TISSUE, RowAttributes = DBA_CONDITION)
dev.off()
#dba.peakset(df.counted, bRetrieve=TRUE, DataType=DBA_DATA_FRAME, writeFile = "all.outliers.counts.txt")
counts <- dba.peakset(df.counted, bRetrieve=TRUE, DataType=DBA_DATA_FRAME)
write.table(counts, "all.outliers.peakset.counts.txt", sep="\t", quote=FALSE, row.names=FALSE)
### Differential - Tissue
# Outlier-filtered set: differential analysis between tissues, with plots
# and a report table.
proj.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
plot.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac"

setwd(plot.dir)
pdf("dba.hmmratac.all-cell.counted.pca.outliers.pdf")
dba.plotPCA(df.counted, attributes=DBA_TISSUE, label=DBA_CONDITION)
dev.off()

df.counted <- dba.contrast(df.counted, categories=DBA_TISSUE)
setwd(proj.dir)
# DBA_ALL_METHODS fits both edgeR and DESeq2 in one call. The separate
# DESeq2-only (default) and method=DBA_EDGER calls that used to precede it
# were overwritten immediately, so they only repeated the fit.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Tissue olig 47 gaba 51 71402 71575
# 2 Tissue olig 47 glu 51 81000 81138
# 3 Tissue glu 51 gaba 51 60603 60914

# All differential plots below are written to the hmmratac output directory.
# NOTE(review): only contrast 1 is requested explicitly; the calls without a
# contrast argument presumably fall back to the same default - confirm.
setwd(plot.dir)
pdf("dba.hmmratac.all-cell.differential.venn.outliers.pdf")
dba.plotVenn(df.analysed,contrast=1,method=DBA_ALL_METHODS)
dev.off()
pdf("dba.hmmratac.all-cell.differential.heatmap.outliers.pdf")
dba.plotHeatmap(df.analysed, contrast=1)
dev.off()
pdf("dba.hmmratac.all-cell.differential.heatmap2.outliers.pdf")
dba.plotHeatmap(df.analysed, ColAttributes = DBA_TISSUE, contrast=1, correlations=FALSE)
dev.off()
pdf("dba.hmmratac.all-cell.plotMA.outliers.pdf")
dba.plotMA(df.analysed)
dev.off()
pdf("dba.hmmratac.all-cell.plotVolcano.outliers.pdf")
dba.plotVolcano(df.analysed)
dev.off()
pdf("dba.hmmratac.all-cell.plotPCA.outliers.pdf")
dba.plotPCA(df.analysed, contrast = 1)
dev.off()
pdf("dba.hmmratac.all-cell.plotBox.outliers.pdf")
dba.plotBox(df.analysed)
dev.off()

report <- dba.report(df.analysed)
report
report.df <- as.data.frame(report)
# NOTE(review): the file is tab-separated despite its .csv extension
write.table(report.df, "peaks.report.hmmratac.all-cell.outliers.csv", sep="\t", quote=FALSE, row.names=FALSE)
### Differential - Condition
# PCA of the counted peak set, using condition for the plot attributes and
# tissue for the point labels.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.all-cell.counted.pca.outliers.condition.pdf")
dba.plotPCA(df.counted, attributes=DBA_CONDITION, label=DBA_TISSUE)
dev.off()

# Set up the contrast on the Condition attribute.
df.counted <- dba.contrast(df.counted, categories=DBA_CONDITION)

# NOTE(review): the analysis is run from the project root, presumably so that
# relative paths in the sample sheet resolve -- confirm.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# DBA_ALL_METHODS runs edgeR and DESeq2 together. The DESeq2-only and
# edgeR-only calls that used to precede this one assigned to the same variable
# and were overwritten immediately, so they only added run time.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
#1 Condition heroin 73 control 76 331 707

# Export the peak set with per-sample values as a tab-separated table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- dba.peakset(df.analysed, bRetrieve=TRUE, DataType=DBA_DATA_FRAME)
write.table(counts, "all.outliers.peakset.differential.condition.counts.txt", sep="\t", quote=FALSE, row.names=FALSE)
# Diagnostic plots and report for the condition contrast held in
# `df.analysed`. All outputs go to the hmmratac directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")

# Venn diagram for contrast 1 across all analysis methods.
pdf(file = "dba.hmmratac.all-cell.differential.venn.outliers.condition.pdf")
dba.plotVenn(df.analysed, contrast = 1, method = DBA_ALL_METHODS)
dev.off()

# Heatmap for contrast 1 with default settings.
pdf(file = "dba.hmmratac.all-cell.differential.heatmap.outliers.condition.pdf")
dba.plotHeatmap(df.analysed, contrast = 1)
dev.off()

# Heatmap with correlations switched off and columns annotated by condition.
pdf(file = "dba.hmmratac.all-cell.differential.heatmap2.outliers.condition.pdf")
dba.plotHeatmap(df.analysed, contrast = 1, correlations = FALSE, ColAttributes = DBA_CONDITION)
dev.off()

pdf(file = "dba.hmmratac.all-cell.plotMA.outliers.condition.pdf")
dba.plotMA(df.analysed)
dev.off()

pdf(file = "dba.hmmratac.all-cell.plotVolcano.outliers.condition.pdf")
dba.plotVolcano(df.analysed)
dev.off()

pdf(file = "dba.hmmratac.all-cell.plotPCA.outliers.condition.pdf")
dba.plotPCA(df.analysed, contrast = 1)
dev.off()

pdf(file = "dba.hmmratac.all-cell.plotBox.outliers.condition.pdf")
dba.plotBox(df.analysed)
dev.off()

# Retrieve the report, show it, and save it.
# The file is tab-separated even though its name ends in .csv.
report <- dba.report(df.analysed)
print(report)
report.df <- as.data.frame(report)
write.table(x = report.df, file = "peaks.report.hmmratac.all-cell.outliers.condition.csv", sep = "\t", quote = FALSE, row.names = FALSE)
### Differential - Condition + Tissue
# Set up contrasts over both the Condition and Tissue attributes.
df.counted <- dba.contrast(df.counted, categories=c(DBA_CONDITION,DBA_TISSUE))

# NOTE(review): the analysis is run from the project root, presumably so that
# relative paths in the sample sheet resolve -- confirm.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# DBA_ALL_METHODS runs edgeR and DESeq2 together. The DESeq2-only and
# edgeR-only calls that used to precede this one assigned to the same variable
# and were overwritten immediately, so they only added run time.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Condition control 76 heroin 73 3282 3744
# 2 Tissue olig 47 gaba 51 71424 71618
# 3 Tissue olig 47 glu 51 81048 81210
# 4 Tissue glu 51 gaba 51 60685 61012

# Export the peak set with per-sample values as a tab-separated table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- dba.peakset(df.analysed, bRetrieve=TRUE, DataType=DBA_DATA_FRAME)
write.table(counts, "all.outliers.peakset.differential.counts.txt", sep="\t", quote=FALSE, row.names=FALSE)
# Set up contrasts over Condition and Tissue.
# The DBA_* attribute identifiers are numeric constants, so writing
# `DBA_CONDITION + DBA_TISSUE` adds them into a single, unrelated value.
# Several attributes have to be passed as a vector.
# NOTE(review): if an additive model was intended, DiffBind >= 3 also accepts
# design = "~ Tissue + Condition" in dba.contrast() -- confirm which is wanted.
df.counted <- dba.contrast(df.counted, categories=c(DBA_CONDITION, DBA_TISSUE))

# NOTE(review): the analysis is run from the project root, presumably so that
# relative paths in the sample sheet resolve -- confirm.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# DBA_ALL_METHODS runs edgeR and DESeq2 together. The DESeq2-only and
# edgeR-only calls that used to precede this one assigned to the same variable
# and were overwritten immediately, so they only added run time.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)

# Export the peak set with per-sample values as a tab-separated table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- dba.peakset(df.analysed, bRetrieve=TRUE, DataType=DBA_DATA_FRAME)
write.table(counts, "all.outliers.peakset.differential.condition.tissue.counts.txt", sep="\t", quote=FALSE, row.names=FALSE)
# Diagnostic plots and report for contrast 1 of the condition + tissue
# analysis held in `df.analysed`. All outputs go to the hmmratac directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")

# Venn diagram for contrast 1 across all analysis methods.
pdf(file = "dba.hmmratac.all-cell.differential.venn.outliers.condition.tissue.pdf")
dba.plotVenn(df.analysed, contrast = 1, method = DBA_ALL_METHODS)
dev.off()

# Heatmap for contrast 1 with default settings.
pdf(file = "dba.hmmratac.all-cell.differential.heatmap.outliers.condition.tissue.pdf")
dba.plotHeatmap(df.analysed, contrast = 1)
dev.off()

# Heatmap with correlations switched off and columns annotated by both
# condition and tissue.
pdf(file = "dba.hmmratac.all-cell.differential.heatmap2.outliers.condition.tissue.pdf")
dba.plotHeatmap(df.analysed, contrast = 1, correlations = FALSE, ColAttributes = c(DBA_CONDITION, DBA_TISSUE))
dev.off()

pdf(file = "dba.hmmratac.all-cell.plotMA.outliers.condition.tissue.pdf")
dba.plotMA(df.analysed)
dev.off()

pdf(file = "dba.hmmratac.all-cell.plotVolcano.outliers.condition.tissue.pdf")
dba.plotVolcano(df.analysed)
dev.off()

pdf(file = "dba.hmmratac.all-cell.plotPCA.outliers.condition.tissue.pdf")
dba.plotPCA(df.analysed, contrast = 1)
dev.off()

pdf(file = "dba.hmmratac.all-cell.plotBox.outliers.condition.tissue.pdf")
dba.plotBox(df.analysed)
dev.off()

# Retrieve the report, show it, and save it.
# The file is tab-separated even though its name ends in .csv.
report <- dba.report(df.analysed)
print(report)
report.df <- as.data.frame(report)
write.table(x = report.df, file = "peaks.report.hmmratac.all.outliers.condition.tissue.csv", sep = "\t", quote = FALSE, row.names = FALSE)
# Reference for the UMAP workflow below: https://alexslemonade.github.io/refinebio-examples/03-rnaseq/dimension-reduction_rnaseq_02_umap.html
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Shell setup for the R session used by the UMAP code that follows.
# Load the site-provided python module.
module load python
# Source the helper script that sets up conda
# (the file name suggests it targets the Andes cluster -- confirm).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# Activate the project conda environment.
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Start R; the lines after this one are R code.
R
library(DESeq2)
library(umap)
library(ggplot2)
library(magrittr)
library(dplyr)
# UMAP of all samples based on variance-stabilised peak counts.

# Set the seed so our results are reproducible:
set.seed(12345)

# Sample annotation, read relative to the project root.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.sample.all.outlier.csv")

# Peak counts table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.peakset.counts.txt", header=TRUE, sep="\t")
# Columns 4-152 are used as the count matrix.
# NOTE(review): presumably the first three columns are peak coordinates and
# the remaining 149 are samples in sample-sheet order -- confirm.
counts.df <- counts[,4:152]
# Round to whole numbers before handing the table to DESeq2.
counts.mat <- round(counts.df)

dds <- DESeqDataSetFromMatrix(
  countData = counts.mat, # the counts values for all samples in our dataset
  colData = samples, # annotation data for the samples in the counts data frame
  design = ~ Condition + Tissue
)

# Normalize and transform the data with DESeq2's `vst()`.
dds_norm <- vst(dds)

# Retrieve the normalized data with `assay()` and transpose it so that each
# row is a sample.
normalized_counts <- assay(dds_norm) %>%
  t()

# Now perform UMAP on the normalized data.
umap_results <- umap::umap(normalized_counts)

# The UMAP coordinates are stored in the `layout` element. Attach the sample
# IDs by position, then add the metadata by joining on sample ID.
umap_plot_df <- data.frame(umap_results$layout) %>%
  mutate(Sample.ID = samples$Sample.ID) %>%
  inner_join(samples, by = "Sample.ID")

# Build the plot once and keep it in an object. Previously the stored object
# was never used and a second, differently styled plot was written to the PDF.
final_annotated_umap_plot <- ggplot(umap_plot_df, aes(x = X1, y = X2, color = Condition, shape = Tissue)) +
  geom_point(size=3) +
  theme_classic()

# Print explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing, so the PDF would be empty when this file is run via source().
pdf("dba.hmmratac.all.umap.outliers.pdf")
print(final_annotated_umap_plot)
dev.off()
## remove gaba and olig outlier samples
# NOTE(review): no samples are actually removed below. `samples.df` is a plain
# copy of `samples` and is not used afterwards, so this section repeats the
# all-sample UMAP and writes to the same PDF file name
# ("dba.hmmratac.all.umap.outliers.pdf"). Implement the filter and pick a
# distinct output name before relying on this result.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.sample.all.outlier.csv")
samples.df <- samples[]

# Peak counts table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.peakset.counts.txt", header=TRUE, sep="\t")
# Columns 4-152 are used as the count matrix.
# NOTE(review): presumably the first three columns are peak coordinates and
# the remaining 149 are samples in sample-sheet order -- confirm.
counts.df <- counts[,4:152]
# Round to whole numbers before handing the table to DESeq2.
counts.mat <- round(counts.df)

dds <- DESeqDataSetFromMatrix(
  countData = counts.mat, # the counts values for all samples in our dataset
  colData = samples, # annotation data for the samples in the counts data frame
  design = ~ Condition + Tissue
)

# Normalize and transform the data with DESeq2's `vst()`.
dds_norm <- vst(dds)

# Retrieve the normalized data with `assay()` and transpose it so that each
# row is a sample.
normalized_counts <- assay(dds_norm) %>%
  t()

# Now perform UMAP on the normalized data.
umap_results <- umap::umap(normalized_counts)

# The UMAP coordinates are stored in the `layout` element. Attach the sample
# IDs by position, then add the metadata by joining on sample ID.
umap_plot_df <- data.frame(umap_results$layout) %>%
  mutate(Sample.ID = samples$Sample.ID) %>%
  inner_join(samples, by = "Sample.ID")

# Build the plot once and keep it in an object. Previously the stored object
# was never used and a second, differently styled plot was written to the PDF.
final_annotated_umap_plot <- ggplot(umap_plot_df, aes(x = X1, y = X2, color = Condition, shape = Tissue)) +
  geom_point(size=3) +
  theme_classic()

# Print explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing, so the PDF would be empty when this file is run via source().
pdf("dba.hmmratac.all.umap.outliers.pdf")
print(final_annotated_umap_plot)
dev.off()
#### Differential
# UMAP on the counts exported from the condition + tissue differential run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.sample.all.outlier.csv")

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.peakset.differential.condition.tissue.counts.txt", header=TRUE, sep="\t")
# Columns 4-152 are used as the count matrix.
# NOTE(review): presumably the first three columns are peak coordinates and
# the remaining 149 are samples in sample-sheet order -- confirm.
counts.df <- counts[,4:152]
# Round to whole numbers before handing the table to DESeq2.
counts.mat <- round(counts.df)

dds <- DESeqDataSetFromMatrix(
  countData = counts.mat, # the counts values for all samples in our dataset
  colData = samples, # annotation data for the samples in the counts data frame
  design = ~ Condition + Tissue
)

# Normalize and transform the data with DESeq2's `vst()`.
dds_norm <- vst(dds)

# Retrieve the normalized data with `assay()` and transpose it so that each
# row is a sample.
normalized_counts <- assay(dds_norm) %>%
  t()

# Now perform UMAP on the normalized data.
umap_results <- umap::umap(normalized_counts)

# The UMAP coordinates are stored in the `layout` element. Attach the sample
# IDs by position, then add the metadata by joining on sample ID.
umap_plot_df <- data.frame(umap_results$layout) %>%
  mutate(Sample.ID = samples$Sample.ID) %>%
  inner_join(samples, by = "Sample.ID")

# Build the plot once and keep it in an object. Previously the stored object
# was never used and a second, differently styled plot was written to the PDF.
final_annotated_umap_plot <- ggplot(umap_plot_df, aes(x = X1, y = X2, color = Condition, shape = Tissue)) +
  geom_point(size=3) +
  theme_classic()

# Print explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing, so the PDF would be empty when this file is run via source().
pdf("dba.hmmratac.all.differential.umap.outliers.pdf")
print(final_annotated_umap_plot)
dev.off()
#### Differential - condition only
# UMAP on the counts exported from the condition-only differential run.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.sample.all.outlier.csv")

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.peakset.differential.condition.counts.txt", header=TRUE, sep="\t")
# Columns 4-152 are used as the count matrix.
# NOTE(review): presumably the first three columns are peak coordinates and
# the remaining 149 are samples in sample-sheet order -- confirm.
counts.df <- counts[,4:152]
# Round to whole numbers before handing the table to DESeq2.
counts.mat <- round(counts.df)

dds <- DESeqDataSetFromMatrix(
  countData = counts.mat, # the counts values for all samples in our dataset
  colData = samples, # annotation data for the samples in the counts data frame
  design = ~ Condition + Tissue
)

# Normalize and transform the data with DESeq2's `vst()`.
dds_norm <- vst(dds)

# Retrieve the normalized data with `assay()` and transpose it so that each
# row is a sample.
normalized_counts <- assay(dds_norm) %>%
  t()

# Now perform UMAP on the normalized data.
umap_results <- umap::umap(normalized_counts)

# The UMAP coordinates are stored in the `layout` element. Attach the sample
# IDs by position, then add the metadata by joining on sample ID.
umap_plot_df <- data.frame(umap_results$layout) %>%
  mutate(Sample.ID = samples$Sample.ID) %>%
  inner_join(samples, by = "Sample.ID")

# Build the plot once and keep it in an object. Previously the stored object
# was never used and a second, differently styled plot was written to the PDF.
final_annotated_umap_plot <- ggplot(umap_plot_df, aes(x = X1, y = X2, color = Condition, shape = Tissue)) +
  geom_point(size=3) +
  theme_classic()

# Print explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing, so the PDF would be empty when this file is run via source().
pdf("dba.hmmratac.all.differential.condition.umap.outliers.pdf")
print(final_annotated_umap_plot)
dev.off()
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Shell setup for the olig DiffBind analysis that follows.
# Load the site-provided python module.
module load python
# Source the helper script that sets up conda
# (the file name suggests it targets the Andes cluster -- confirm).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# Activate the project conda environment.
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# grep 'olig' key.files/diffbind.hmmratac.sample.all.outlier.csv | cat key.files/diffbind.hmmratac.header.csv - > key.files/diffbind.hmmratac.sample.olig.outlier.csv
library(DiffBind)
library(tidyverse)
# Load the olig sample sheet, build the DiffBind object and count reads.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")

# Read the sheet separately only to inspect its column names.
samples <- read.csv(file = "key.files/diffbind.hmmratac.sample.olig.outlier.csv")
print(names(samples))

# dba() receives the sheet path, not the data frame read above.
df <- dba(sampleSheet = "key.files/diffbind.hmmratac.sample.olig.outlier.csv")
print(head(df$peaks[[1]]))
# Chr Start End Score
# 20755 NC_000001.11 0 10610 1
# 20756 NC_000001.11 19830 29440 1
# 20757 NC_000001.11 180720 181360 1
# 20758 NC_000001.11 180960 182300 1
# 20759 NC_000001.11 189370 190410 1
# 20760 NC_000001.11 586040 587080 1

# Heatmap of the uncounted object, written to the hmmratac directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf(file = "dba.hmmratac.olig.heatmap.pdf")
dba.plotHeatmap(df)
dev.off()

# Count reads from the project root, with summits = 250.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df.counted <- dba.count(df, summits = 250)
print(df.counted)
# ID Tissue Condition Treatment Replicate Reads FRiP
# 1 1 olig control control 1 35419836 0.12
# 2 2 olig control control 1 39985688 0.07
# 3 3 olig control control 1 18410024 0.15
# 4 4 olig control control 1 33169147 0.07
# 5 5 olig control control 1 35730552 0.11
# 6 6 olig control control 1 30796132 0.11
# 7 7 olig control control 1 12721318 0.10
# 8 8 olig control control 1 14676769 0.06
# 9 9 olig control control 1 43746962 0.02
# 10 10 olig control control 1 17095336 0.05
# 11 11 olig control control 1 14587566 0.05
# 12 12 olig control control 1 12936951 0.09
# 13 13 olig control control 1 11352906 0.12
# 14 14 olig control control 1 15112980 0.12
# 15 15 olig control control 1 21742002 0.11
# 16 16 olig control control 1 26727988 0.06
# 17 17 olig control control 1 21531922 0.08
# 18 18 olig control control 1 22677852 0.16
# 19 19 olig control control 1 12134577 0.17
# 20 20 olig control control 1 19943332 0.08
# 21 21 olig control control 1 24681970 0.06
# 22 22 olig control control 1 8340420 0.06
# 23 23 olig control control 1 27769905 0.17
# 24 24 olig control control 1 6205234 0.08
# 25 25 olig control control 1 26965074 0.08
# 26 26 olig control control 1 20045962 0.12
# 27 27 olig control control 1 21634187 0.10
# 28 28 olig control control 1 9552678 0.12
# 29 29 olig control control 1 26725124 0.15
# 30 30 olig heroin heroin 1 17678860 0.11
# 31 31 olig heroin heroin 1 33878184 0.06
# 32 32 olig heroin heroin 1 34083352 0.04
# 33 33 olig heroin heroin 1 30492794 0.06
# 34 34 olig heroin heroin 1 18033990 0.08
# 35 35 olig heroin heroin 1 23300071 0.09
# 36 36 olig heroin heroin 1 22856649 0.12
# 37 37 olig heroin heroin 1 23347042 0.15
# 38 38 olig heroin heroin 1 17824007 0.12
# 39 39 olig heroin heroin 1 32371754 0.08
# 40 40 olig heroin heroin 1 37173477 0.07
# 41 41 olig heroin heroin 1 32197804 0.10
# 42 42 olig heroin heroin 1 38069042 0.04
# 43 43 olig heroin heroin 1 23155520 0.04
# 44 44 olig heroin heroin 1 24553068 0.10
# 45 45 olig heroin heroin 1 23520314 0.07
# 46 46 olig heroin heroin 1 26375644 0.09
# 47 47 olig heroin heroin 1 22820588 0.13
# 48 48 olig heroin heroin 1 27431814 0.12
# 49 49 olig heroin heroin 1 11751331 0.07
# 50 50 olig heroin heroin 1 15541901 0.13
# 51 51 olig heroin heroin 1 21521670 0.11
# 52 52 olig heroin heroin 1 22589463 0.07
# 53 53 olig heroin heroin 1 26240104 0.11
# 54 54 olig heroin heroin 1 27822724 0.07
# 55 55 olig heroin heroin 1 8818835 0.12
# 56 56 olig heroin heroin 1 10785076 0.16
### LOW FRiP scores (should be >0.3)... concerning???
# Heatmap and PCA of the counted olig data, then the condition contrast and
# the differential analysis.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.olig.counted.heatmap.pdf")
dba.plotHeatmap(df.counted)
dev.off()

pdf("dba.hmmratac.olig.counted.pca.pdf")
dba.plotPCA(df.counted, attributes=DBA_CONDITION, label=DBA_CONDITION)
dev.off()

# Set up the contrast on the Condition attribute and list the object's fields.
df.counted <- dba.contrast(df.counted, categories=DBA_CONDITION)
names(df.counted)
# [1] "peaks" "class" "chrmap" "config" "samples"
# [6] "called" "score" "binding" "merged" "totalMerged"
# [11] "attributes" "minOverlap" "masks" "SN" "maxFilter"
# [16] "filterFun" "minCount" "summits" "meta" "contrasts"
# [21] "design" "DESeq2"

# NOTE(review): the analysis is run from the project root, presumably so that
# relative paths in the sample sheet resolve -- confirm.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# DBA_ALL_METHODS runs edgeR and DESeq2 together. The DESeq2-only and
# edgeR-only calls that used to precede this one assigned to the same variable
# and were overwritten immediately, so they only added run time.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Condition heroin 23 control 24 30 0
# Diagnostic plots and report for the olig condition contrast.
# The heatmaps, PCA and report explicitly request the edgeR results.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")

# Venn diagram for contrast 1 across all analysis methods.
pdf(file = "dba.hmmratac.olig.differential.venn.pdf")
dba.plotVenn(df.analysed, contrast = 1, method = DBA_ALL_METHODS)
dev.off()

pdf(file = "dba.hmmratac.olig.differential.heatmap.pdf")
dba.plotHeatmap(df.analysed, contrast = 1, method = DBA_EDGER)
dev.off()

# Heatmap with correlations switched off and columns annotated by condition.
pdf(file = "dba.hmmratac.olig.differential.heatmap2.pdf")
dba.plotHeatmap(df.analysed, contrast = 1, method = DBA_EDGER, correlations = FALSE, ColAttributes = DBA_CONDITION)
dev.off()

pdf(file = "dba.hmmratac.olig.plotMA.pdf")
dba.plotMA(df.analysed)
dev.off()

pdf(file = "dba.hmmratac.olig.plotVolcano.pdf")
dba.plotVolcano(df.analysed)
dev.off()

pdf(file = "dba.hmmratac.olig.plotPCA.pdf")
dba.plotPCA(df.analysed, method = DBA_EDGER)
dev.off()

pdf(file = "dba.hmmratac.olig.plotBox.pdf")
dba.plotBox(df.analysed)
dev.off()

# Retrieve the edgeR report, show it, and save it.
# The file is tab-separated even though its name ends in .csv.
report <- dba.report(df.analysed, method = DBA_EDGER)
print(report)
report.df <- as.data.frame(report)
write.table(x = report.df, file = "peaks.report.hmmratac.olig.csv", sep = "\t", quote = FALSE, row.names = FALSE)
# scp -r noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/ .
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Shell setup for the gaba DiffBind analysis that follows.
# Load the site-provided python module.
module load python
# Source the helper script that sets up conda
# (the file name suggests it targets the Andes cluster -- confirm).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# Activate the project conda environment.
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# grep 'gaba' key.files/diffbind.hmmratac.sample.all.outlier.csv | cat key.files/diffbind.hmmratac.header.csv - > key.files/diffbind.hmmratac.sample.gaba.outlier.csv
library(DiffBind)
library(tidyverse)
# Load the gaba sample sheet, build the DiffBind object and count reads.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")

# Read the sheet separately only to inspect its column names.
samples <- read.csv(file = "key.files/diffbind.hmmratac.sample.gaba.outlier.csv")
print(names(samples))

# dba() receives the sheet path, not the data frame read above.
df <- dba(sampleSheet = "key.files/diffbind.hmmratac.sample.gaba.outlier.csv")
print(head(df$peaks[[1]]))

# Heatmap of the uncounted object, written to the hmmratac directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf(file = "dba.hmmratac.gaba.heatmap.pdf")
dba.plotHeatmap(df)
dev.off()

# Count reads from the project root, with summits = 250.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df.counted <- dba.count(df, summits = 250)
print(df.counted)
# ID Tissue Condition Treatment Replicate Reads FRiP
# 1 1 gaba control control 1 42725768 0.07
# 2 2 gaba control control 1 43704543 0.08
# 3 3 gaba control control 1 46220300 0.06
# 4 4 gaba control control 1 42498778 0.05
# 5 5 gaba control control 1 33952735 0.15
# 6 6 gaba control control 1 43848567 0.06
# 7 7 gaba control control 1 37167868 0.09
# 8 8 gaba control control 1 34548832 0.04
# 9 9 gaba control control 1 43780400 0.03
# 10 10 gaba control control 1 16734968 0.03
# 11 11 gaba control control 1 11692172 0.07
# 12 12 gaba control control 1 36431902 0.12
# 13 13 gaba control control 1 45130302 0.10
# 14 14 gaba control control 1 23520169 0.16
# 15 15 gaba control control 1 25494066 0.07
# 16 16 gaba control control 1 24408560 0.14
# 17 17 gaba control control 1 36244265 0.17
# 18 18 gaba control control 1 21035405 0.13
# 19 19 gaba control control 1 28541170 0.14
# 20 20 gaba control control 1 27219006 0.07
# 21 21 gaba control control 1 24313938 0.10
# 22 22 gaba control control 1 26060403 0.11
# 23 23 gaba control control 1 22798424 0.09
# 24 24 gaba control control 1 25728756 0.07
# 25 25 gaba control control 1 18636863 0.09
# 26 26 gaba control control 1 24340360 0.13
# 27 27 gaba control control 1 21645282 0.17
# 28 28 gaba control control 1 27332056 0.12
# 29 29 gaba heroin heroin 1 38209635 0.09
# 30 30 gaba heroin heroin 1 35455492 0.09
# 31 31 gaba heroin heroin 1 39350404 0.08
# 32 32 gaba heroin heroin 1 43190608 0.03
# 33 33 gaba heroin heroin 1 38504060 0.04
# 34 34 gaba heroin heroin 1 43133304 0.05
# 35 35 gaba heroin heroin 1 31649714 0.08
# 36 36 gaba heroin heroin 1 41832122 0.11
# 37 37 gaba heroin heroin 1 36760146 0.09
# 38 38 gaba heroin heroin 1 17267758 0.13
# 39 39 gaba heroin heroin 1 39684486 0.11
# 40 40 gaba heroin heroin 1 16119282 0.08
# 41 41 gaba heroin heroin 1 13521178 0.06
# 42 42 gaba heroin heroin 1 21057856 0.06
# 43 43 gaba heroin heroin 1 23836164 0.07
# 44 44 gaba heroin heroin 1 23572849 0.09
# 45 45 gaba heroin heroin 1 22521928 0.10
# 46 46 gaba heroin heroin 1 22137926 0.11
# 47 47 gaba heroin heroin 1 29906670 0.13
# 48 48 gaba heroin heroin 1 25420382 0.05
# 49 49 gaba heroin heroin 1 19021498 0.13
# 50 50 gaba heroin heroin 1 21048088 0.07
# 51 51 gaba heroin heroin 1 20019642 0.07
# 52 52 gaba heroin heroin 1 25704207 0.17
# 53 53 gaba heroin heroin 1 22191212 0.05
# 54 54 gaba heroin heroin 1 20833286 0.11
# 55 55 gaba heroin heroin 1 27920540 0.09
# Heatmap and PCA of the counted gaba data, then the condition contrast and
# the differential analysis.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.gaba.counted.heatmap.pdf")
dba.plotHeatmap(df.counted)
dev.off()

pdf("dba.hmmratac.gaba.counted.pca.pdf")
dba.plotPCA(df.counted, attributes=DBA_CONDITION, label=DBA_CONDITION)
dev.off()

# Set up the contrast on the Condition attribute.
df.counted <- dba.contrast(df.counted, categories=DBA_CONDITION)

# NOTE(review): the analysis is run from the project root, presumably so that
# relative paths in the sample sheet resolve -- confirm.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# DBA_ALL_METHODS runs edgeR and DESeq2 together. The DESeq2-only and
# edgeR-only calls that used to precede this one assigned to the same variable
# and were overwritten immediately, so they only added run time.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Condition heroin 25 control 26 83 155
# Venn diagram and report for the gaba condition contrast.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")

# Venn diagram for contrast 1 across all analysis methods.
pdf(file = "dba.hmmratac.gaba.differential.venn.pdf")
dba.plotVenn(df.analysed, contrast = 1, method = DBA_ALL_METHODS)
dev.off()

# Retrieve the report, show it, and save it.
# The file is tab-separated even though its name ends in .csv.
report <- dba.report(df.analysed)
print(report)
report.df <- as.data.frame(report)
write.table(x = report.df, file = "peaks.report.hmmratac.gaba.csv", sep = "\t", quote = FALSE, row.names = FALSE)
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Shell setup for the glu DiffBind analysis that follows.
# Load the site-provided python module.
module load python
# Source the helper script that sets up conda
# (the file name suggests it targets the Andes cluster -- confirm).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# Activate the project conda environment.
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# grep 'glu' key.files/diffbind.hmmratac.sample.all.outlier.csv | cat key.files/diffbind.hmmratac.header.csv - > key.files/diffbind.hmmratac.sample.glu.outlier.csv
library(DiffBind)
library(tidyverse)
# Load the glu sample sheet, build the DiffBind object and count reads.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")

# Read the sheet separately only to inspect its column names.
samples <- read.csv(file = "key.files/diffbind.hmmratac.sample.glu.outlier.csv")
print(names(samples))

# dba() receives the sheet path, not the data frame read above.
df <- dba(sampleSheet = "key.files/diffbind.hmmratac.sample.glu.outlier.csv")
print(head(df$peaks[[1]]))

# Heatmap of the uncounted object, written to the hmmratac directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf(file = "dba.hmmratac.glu.heatmap.pdf")
dba.plotHeatmap(df)
dev.off()

# Count reads from the project root, with summits = 250.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df.counted <- dba.count(df, summits = 250)
print(df.counted)
# Heatmap and PCA of the counted glu data, then the condition contrast and
# the differential analysis.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.glu.counted.heatmap.pdf")
dba.plotHeatmap(df.counted)
dev.off()

pdf("dba.hmmratac.glu.counted.pca.pdf")
dba.plotPCA(df.counted, attributes=DBA_CONDITION, label=DBA_CONDITION)
dev.off()

# Set up the contrast on the Condition attribute.
df.counted <- dba.contrast(df.counted, categories=DBA_CONDITION)

# NOTE(review): the analysis is run from the project root, presumably so that
# relative paths in the sample sheet resolve -- confirm.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# DBA_ALL_METHODS runs edgeR and DESeq2 together. The DESeq2-only and
# edgeR-only calls that used to precede this one assigned to the same variable
# and were overwritten immediately, so they only added run time.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Condition heroin 25 control 26 155 1203
# Diagnostic plots and report for the glu condition contrast.
# All outputs go to the hmmratac directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")

# Venn diagram for contrast 1 across all analysis methods.
pdf(file = "dba.hmmratac.glu.differential.venn.pdf")
dba.plotVenn(df.analysed, contrast = 1, method = DBA_ALL_METHODS)
dev.off()

# Heatmap for contrast 1 with default settings.
pdf(file = "dba.hmmratac.glu.differential.heatmap.pdf")
dba.plotHeatmap(df.analysed, contrast = 1)
dev.off()

# Heatmap with correlations switched off and columns annotated by condition.
pdf(file = "dba.hmmratac.glu.differential.heatmap2.pdf")
dba.plotHeatmap(df.analysed, contrast = 1, correlations = FALSE, ColAttributes = DBA_CONDITION)
dev.off()

pdf(file = "dba.hmmratac.glu.plotMA.pdf")
dba.plotMA(df.analysed)
dev.off()

pdf(file = "dba.hmmratac.glu.plotVolcano.pdf")
dba.plotVolcano(df.analysed)
dev.off()

pdf(file = "dba.hmmratac.glu.plotPCA.pdf")
dba.plotPCA(df.analysed)
dev.off()

pdf(file = "dba.hmmratac.glu.plotBox.pdf")
dba.plotBox(df.analysed)
dev.off()

# Retrieve the report, show it, and save it.
# The file is tab-separated even though its name ends in .csv.
report <- dba.report(df.analysed)
print(report)
report.df <- as.data.frame(report)
write.table(x = report.df, file = "peaks.report.hmmratac.glu.csv", sep = "\t", quote = FALSE, row.names = FALSE)
# mess with cutoffs to narrow in on PCA...
# add additional metadata and look at loading plot...
# Exploratory step: read the saved glu report back in and attempt a UMAP.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
# Reads the tab-separated glu report from disk. This reassigns `df`, which
# until now held the DBA object returned by dba().
df <- read.delim("peaks.report.hmmratac.glu.csv", header=T, sep="\t")
# NOTE(review): this call is given the DBA object `df.analysed`, not the table
# just read into `df`, and this section only loads DiffBind and tidyverse.
# The labels/controlscale/scale arguments look like the interface of
# M3C::umap rather than umap::umap -- confirm which function and which input
# matrix were intended before relying on this line.
umap(df.analysed,labels=DBA_CONDITION,controlscale=TRUE,scale=3)
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Shell setup for the R session used by the per-tissue UMAP code that follows.
# Load the site-provided python module.
module load python
# Source the helper script that sets up conda
# (the file name suggests it targets the Andes cluster -- confirm).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# Activate the project conda environment.
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Start R; the lines after this one are R code.
R
library(DESeq2)
library(umap)
library(ggplot2)
library(magrittr)
library(dplyr)
# Load the sample sheet and the condition-contrast counts, then split both
# into per-tissue subsets (olig, gaba, glu) for the UMAP sections below.

# Set the seed so our results are reproducible:
set.seed(12345)

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.sample.all.outlier.csv")

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.peakset.differential.condition.counts.txt", header=TRUE, sep="\t")
# Columns 4-152 are used as the count matrix.
# NOTE(review): presumably the first three columns are peak coordinates and
# the remaining 149 are samples in sample-sheet order -- confirm.
counts.df <- counts[,4:152]
# Round to whole numbers before handing the table to DESeq2.
counts.mat <- round(counts.df)

# The split pairs count column i with sample-sheet row i, so both must have
# the same length.
stopifnot(ncol(counts.mat) == nrow(samples))

# Select each tissue from the sample sheet's Tissue column instead of the
# hard-coded ranges 1:47 / 48:98 / 99:149. The result is identical while the
# sheet is ordered olig, gaba, glu with those group sizes, and stays correct
# if samples are added, dropped or reordered.
olig.idx <- which(samples$Tissue == "olig")
gaba.idx <- which(samples$Tissue == "gaba")
glu.idx <- which(samples$Tissue == "glu")

# drop = FALSE keeps a data frame even if a tissue has a single sample.
counts.olig <- counts.mat[, olig.idx, drop = FALSE]
counts.gaba <- counts.mat[, gaba.idx, drop = FALSE]
counts.glu <- counts.mat[, glu.idx, drop = FALSE]
samples.olig <- samples[olig.idx, ]
samples.gaba <- samples[gaba.idx, ]
samples.glu <- samples[glu.idx, ]
# OLIG
# UMAP of the olig samples. Uses `counts.olig` and `samples.olig`, which are
# created before this section.
dds <- DESeqDataSetFromMatrix(
  countData = counts.olig, # the counts values for the olig samples
  colData = samples.olig, # annotation data for the same samples
  design = ~ Condition
)

# Normalize and transform the data with DESeq2's `vst()`.
dds_norm <- vst(dds)

# Retrieve the normalized data with `assay()` and transpose it so that each
# row is a sample.
normalized_counts <- assay(dds_norm) %>%
  t()

# Now perform UMAP on the normalized data.
umap_results <- umap::umap(normalized_counts)

# The UMAP coordinates are stored in the `layout` element. Attach the sample
# IDs by position, then add the metadata by joining on sample ID.
umap_plot_df <- data.frame(umap_results$layout) %>%
  mutate(Sample.ID = samples.olig$Sample.ID) %>%
  inner_join(samples.olig, by = "Sample.ID")

# Print explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing, so the PDF would be empty when this file is run via source().
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.olig.umap.outliers.pdf")
print(
  ggplot(umap_plot_df, aes(x = X1, y = X2, color = Condition, shape = Tissue)) +
    geom_point(size=3) +
    theme_classic()
)
dev.off()
# GABA
# UMAP of the gaba samples. Uses `counts.gaba` and `samples.gaba`, which are
# created before this section.
dds <- DESeqDataSetFromMatrix(
  countData = counts.gaba, # the counts values for the gaba samples
  colData = samples.gaba, # annotation data for the same samples
  design = ~ Condition
)

# Normalize and transform the data with DESeq2's `vst()`.
dds_norm <- vst(dds)

# Retrieve the normalized data with `assay()` and transpose it so that each
# row is a sample.
normalized_counts <- assay(dds_norm) %>%
  t()

# Now perform UMAP on the normalized data.
umap_results <- umap::umap(normalized_counts)

# The UMAP coordinates are stored in the `layout` element. Attach the sample
# IDs by position, then add the metadata by joining on sample ID.
umap_plot_df <- data.frame(umap_results$layout) %>%
  mutate(Sample.ID = samples.gaba$Sample.ID) %>%
  inner_join(samples.gaba, by = "Sample.ID")

# Print explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing, so the PDF would be empty when this file is run via source().
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.gaba.umap.outliers.pdf")
print(
  ggplot(umap_plot_df, aes(x = X1, y = X2, color = Condition, shape = Tissue)) +
    geom_point(size=3) +
    theme_classic()
)
dev.off()
# GLU
# UMAP of the glu samples. Uses `counts.glu` and `samples.glu`, which are
# created before this section.
dds <- DESeqDataSetFromMatrix(
  countData = counts.glu, # the counts values for the glu samples
  colData = samples.glu, # annotation data for the same samples
  design = ~ Condition
)

# Normalize and transform the data with DESeq2's `vst()`.
dds_norm <- vst(dds)

# Retrieve the normalized data with `assay()` and transpose it so that each
# row is a sample.
normalized_counts <- assay(dds_norm) %>%
  t()

# Now perform UMAP on the normalized data.
umap_results <- umap::umap(normalized_counts)

# The UMAP coordinates are stored in the `layout` element. Attach the sample
# IDs by position, then add the metadata by joining on sample ID.
umap_plot_df <- data.frame(umap_results$layout) %>%
  mutate(Sample.ID = samples.glu$Sample.ID) %>%
  inner_join(samples.glu, by = "Sample.ID")

# Print explicitly: a bare ggplot expression is only drawn by top-level
# auto-printing, so the PDF would be empty when this file is run via source().
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.glu.umap.outliers.pdf")
print(
  ggplot(umap_plot_df, aes(x = X1, y = X2, color = Condition, shape = Tissue)) +
    geom_point(size=3) +
    theme_classic()
)
dev.off()
# Summit <-- run out of time on summit... install samtools on andes
# Summit: set up conda, then submit the BAM merge and merged-BAM index jobs.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3
# Submit from the rmdups.bam directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam
# Hand the command list bam.merge.commands.txt to the submit-3 wrapper as job
# "bam.merge" with a 2 h limit on 6 nodes
# (--maxpernode 1 presumably caps commands per node -- confirm).
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bam.merge --time 02:00:00 --maxpernode 1 --nodes 6 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/commands/bam.merge.commands.txt
# Submit the index job from the merge/bam directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/merge/bam
# Same wrapper, job "bam.merge.index", 2 h limit on 2 nodes.
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bam.merge.index --time 02:00:00 --maxpernode 1 --nodes 2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/commands/bam.merge.index.commands.txt
# Andes
# Andes: set up conda, then submit the BAM merge and merged-BAM index jobs
# using the Andes-specific command lists.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#conda install -c bioconda hmmratac
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/samtools/samtools
# Submit from the rmdups.bam directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam
# Hand bam.merge.andes.commands.txt to the Andes submit wrapper as job
# "bam.merge" with a 24 h limit on 6 nodes
# (--maxpernode 2 presumably caps commands per node -- confirm).
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name bam.merge --time 24:00:00 --maxpernode 2 --nodes 6 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/commands/bam.merge.andes.commands.txt
# Submit the index job from the merge/bam directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/merge/bam
# Same wrapper, job "bam.merge.index", 24 h limit on 2 nodes.
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name bam.merge.index --time 24:00:00 --maxpernode 2 --nodes 2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/commands/bam.merge.index.andes.commands.txt
# java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/merge/bam/control.olig.merge.bam -i bwa.output/merge/bam/control.olig.merge.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/merge/control.olig -e ref/hg38-blacklist.v2.ensembl.bed
# java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/merge/bam/control.gaba.merge.bam -i bwa.output/merge/bam/control.gaba.merge.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/merge/control.gaba -e ref/hg38-blacklist.v2.ensembl.bed
# java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/merge/bam/control.glu.merge.bam -i bwa.output/merge/bam/control.glu.merge.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/merge/control.glu -e ref/hg38-blacklist.v2.ensembl.bed
# java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/merge/bam/heroin.olig.merge.bam -i bwa.output/merge/bam/heroin.olig.merge.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/merge/heroin.olig -e ref/hg38-blacklist.v2.ensembl.bed
# java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/merge/bam/heroin.gaba.merge.bam -i bwa.output/merge/bam/heroin.gaba.merge.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/merge/heroin.gaba -e ref/hg38-blacklist.v2.ensembl.bed
# java -Xms512m -Xmx16g -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/HMMRATAC_V1.2.10_exe.jar -b bwa.output/merge/bam/heroin.glu.merge.bam -i bwa.output/merge/bam/heroin.glu.merge.bam.bai -g ref/GRCh38.p13.chr.size -o bwa.output/hmmratac/merge/heroin.glu -e ref/hg38-blacklist.v2.ensembl.bed
# Submit the HMMRATAC peak-calling commands for the merged BAMs from the project root
# (the command file path is relative to this directory).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name hmmratac --time 48:00:00 --maxpernode 1 --nodes 6 commands/hmmratac.merge.commands.txt
# create directory with only _summits.bed files
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac
mkdir -p peaks.summit
cp *_summits.bed peaks.summit/.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac/peaks.summit
# Pad every summit interval by 50 bp on each side and write <file>.out.
# The start is clamped at 0 so a summit closer than 50 bp to the beginning of a
# sequence cannot produce a negative (invalid) BED start coordinate.
# NOTE(review): the padded end ($3+50) is not clamped to the sequence length -- confirm
# whether downstream tools need that.
for i in *.bed; do
    awk '{start = $2 - 50; if (start < 0) start = 0; print $1"\t"start"\t"($3+50)"\t"$4"\t"$5}' "$i" > "$i.out"
done
# Load conda and activate the Andes environment for the R steps that follow
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
## [1] "SampleID" "Tissue" "Factor" "Condition" "Treatment"
## [6] "Replicate" "bamReads" "ControlID" "bamControl" "Peaks"
## [11] "PeakCaller"
# Build the DiffBind sample sheet (one row per sample) from the ATAC metadata table
# and write it to key.files/diffbind.hmmratac.summit.sample.csv.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
# The metadata file has no header row, so its columns are read as V1, V2, ...
df <- read.delim("atac.metadata.txt", header=FALSE, sep="\t")
library(dplyr)
library(tidyr)
# V1 is split at "_" into the sample ID and a second token (SampleSeq)
df2 <- separate(df, V1, c("Sample.ID", "SampleSeq"), sep="_")
# V2 is split into Condition and Tissue (separate()'s default separator)
df3 <- separate(df2, V2, c("Condition", "Tissue"))
# Peak and BAM file paths are assembled as <dir><original V1 value><extension>
df3$PeakName <- df$V1
df3$PeakCaller <- "bed"
df3$PeaksDir <- "bwa.output/hmmratac/peaks.summit/"
df3$PeaksExt <- "_summits.bed.out"
df3$BamName <- df$V1
df3$BamDir <- "bwa.output/rmdups.bam/"
df3$BamExt <- ".rmdups.bam"
df4 <- unite(df3, Peaks, c(PeaksDir, PeakName, PeaksExt), sep="")
df5 <- unite(df4, bamReads, c(BamDir, BamName, BamExt), sep="")
# Columns with no information for this experiment
df5$Factor <- NA
df5$Treatment <- df5$Condition
df5$Replicate <- 1
df5$Control.ID <- NA
df5$bamControl <- NA
# Select the output columns by name rather than by position, so the sheet stays
# correct if the metadata file ever gains a column.
# NOTE(review): the expected DiffBind header listed above spells these columns
# `SampleID` / `ControlID`, while this sheet writes `Sample.ID` / `Control.ID`.
# The names are left unchanged because later code reads `Sample.ID` from the
# sample sheets -- confirm which spelling DiffBind should see.
sheet_columns <- c("Sample.ID", "Tissue", "Factor", "Condition", "Treatment",
                   "Replicate", "bamReads", "Control.ID", "bamControl",
                   "Peaks", "PeakCaller")
df.sample <- df5[, sheet_columns]
write.csv(df.sample, file = 'diffbind.hmmratac.summit.sample.csv', row.names=FALSE)
# Split the full sample sheet into one sheet per cell type (olig, gaba, glu).
# Each per-tissue sheet is the header row followed by every line of the full
# sheet that contains the tissue name.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
sheet=key.files/diffbind.hmmratac.summit.sample.csv
header=key.files/diffbind.hmmratac.summit.header.csv
head -1 "$sheet" > "$header"
for tissue in olig gaba glu; do
    { cat "$header"; grep "$tissue" "$sheet"; } > "key.files/diffbind.hmmratac.sample.summit.$tissue.csv"
done
# Interactive allocation used for the R session below:
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# sed 's/_peaks.gappedPeak.col.bed/_summits.bed.out/g' key.files/diffbind.hmmratac.sample.all.outlier2.csv | sed 's/peaks.coord/peaks.summit/g' > key.files/diffbind.hmmratac.summit.sample.outlier.csv
library(DiffBind)
library(tidyverse)
# Load the outlier-filtered sample sheet into a DiffBind object and draw the
# heatmap of the raw peak sets.
project_dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
sample_sheet <- "key.files/diffbind.hmmratac.summit.sample.outlier.csv"

# The sample sheet path is relative to the project root
setwd(project_dir)
samples <- read.csv(sample_sheet)
names(samples)
df <- dba(sampleSheet = sample_sheet)
# Peek at the peak table of the first sample
head(df$peaks[[1]])

# Write the heatmap into the hmmratac output directory ...
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.all.heatmap.outliers.narrowpeak.pdf")
dba.plotHeatmap(df)
dev.off()
# ... then return to the project root for the steps that follow
setwd(project_dir)
#df.counted <- dba.count(df, summits=250)
#df.counted
# 148 Samples, 145688 sites in matrix:
# ID Tissue Condition Treatment Replicate Reads FRiP
# 1 1 olig control control 1 35419836 0.20
# 2 2 olig control control 1 39985688 0.14
# 3 3 olig control control 1 18410024 0.24
# 4 4 olig control control 1 33169147 0.12
# 5 5 olig control control 1 35730552 0.18
# 6 6 olig control control 1 30796132 0.19
# 7 7 olig control control 1 12721318 0.18
# 8 8 olig control control 1 14676769 0.12
# 9 9 olig control control 1 43746962 0.07
# 10 10 olig control control 1 17095336 0.10
# 11 11 olig control control 1 12936951 0.16
# 12 12 olig control control 1 11352906 0.21
# 13 13 olig control control 1 15112980 0.21
# 14 14 olig control control 1 21742002 0.18
# 15 15 olig control control 1 26727988 0.11
# 16 16 olig control control 1 21531922 0.14
# 17 17 olig control control 1 22677852 0.24
# 18 18 olig control control 1 19943332 0.15
# 19 19 olig control control 1 24681970 0.11
# 20 20 olig control control 1 27769905 0.27
# 21 21 olig control control 1 26965074 0.15
# 22 22 olig control control 1 20045962 0.18
# 23 23 olig control control 1 21634187 0.16
# 24 24 olig control control 1 26725124 0.23
# 25 25 olig heroin heroin 1 17678860 0.19
# 26 26 olig heroin heroin 1 33878184 0.12
# 27 27 olig heroin heroin 1 34083352 0.08
# 28 28 olig heroin heroin 1 30492794 0.11
# 29 29 olig heroin heroin 1 18033990 0.14
# 30 30 olig heroin heroin 1 23300071 0.16
# 31 31 olig heroin heroin 1 22856649 0.20
# 32 32 olig heroin heroin 1 23347042 0.25
# 33 33 olig heroin heroin 1 17824007 0.20
# 34 34 olig heroin heroin 1 32371754 0.15
# 35 35 olig heroin heroin 1 37173477 0.13
# 36 36 olig heroin heroin 1 32197804 0.17
# 37 37 olig heroin heroin 1 38069042 0.08
# 38 38 olig heroin heroin 1 23155520 0.08
# 39 39 olig heroin heroin 1 24553068 0.17
# 40 40 olig heroin heroin 1 23520314 0.13
# 41 41 olig heroin heroin 1 26375644 0.15
# 42 42 olig heroin heroin 1 22820588 0.21
# 43 43 olig heroin heroin 1 27431814 0.20
# 44 44 olig heroin heroin 1 21521670 0.19
# 45 45 olig heroin heroin 1 22589463 0.14
# 46 46 olig heroin heroin 1 26240104 0.19
# 47 47 olig heroin heroin 1 27822724 0.13
# 48 48 gaba control control 1 42725768 0.12
# 49 49 gaba control control 1 43704543 0.14
# 50 50 gaba control control 1 46220300 0.11
# 51 51 gaba control control 1 42498778 0.10
# 52 52 gaba control control 1 33952735 0.25
# 53 53 gaba control control 1 43848567 0.11
# 54 54 gaba control control 1 37167868 0.16
# 55 55 gaba control control 1 34548832 0.09
# 56 56 gaba control control 1 43780400 0.07
# 57 57 gaba control control 1 11692172 0.14
# 58 58 gaba control control 1 36431902 0.21
# 59 59 gaba control control 1 45130302 0.17
# 60 60 gaba control control 1 23520169 0.24
# 61 61 gaba control control 1 25494066 0.12
# 62 62 gaba control control 1 24408560 0.22
# 63 63 gaba control control 1 36244265 0.26
# 64 64 gaba control control 1 21035405 0.20
# 65 65 gaba control control 1 28541170 0.22
# 66 66 gaba control control 1 27219006 0.13
# 67 67 gaba control control 1 24313938 0.18
# 68 68 gaba control control 1 26060403 0.19
# 69 69 gaba control control 1 22798424 0.15
# 70 70 gaba control control 1 25728756 0.12
# 71 71 gaba control control 1 18636863 0.14
# 72 72 gaba control control 1 21645282 0.27
# 73 73 gaba control control 1 27332056 0.18
# 74 74 gaba heroin heroin 1 38209635 0.17
# 75 75 gaba heroin heroin 1 39350404 0.14
# 76 76 gaba heroin heroin 1 43190608 0.07
# 77 77 gaba heroin heroin 1 38504060 0.08
# 78 78 gaba heroin heroin 1 43133304 0.11
# 79 79 gaba heroin heroin 1 31649714 0.14
# 80 80 gaba heroin heroin 1 41832122 0.18
# 81 81 gaba heroin heroin 1 36760146 0.16
# 82 82 gaba heroin heroin 1 17267758 0.20
# 83 83 gaba heroin heroin 1 39684486 0.19
# 84 84 gaba heroin heroin 1 16119282 0.14
# 85 85 gaba heroin heroin 1 13521178 0.12
# 86 86 gaba heroin heroin 1 21057856 0.12
# 87 87 gaba heroin heroin 1 23836164 0.13
# 88 88 gaba heroin heroin 1 23572849 0.16
# 89 89 gaba heroin heroin 1 22521928 0.16
# 90 90 gaba heroin heroin 1 22137926 0.18
# 91 91 gaba heroin heroin 1 29906670 0.21
# 92 92 gaba heroin heroin 1 25420382 0.10
# 93 93 gaba heroin heroin 1 21048088 0.14
# 94 94 gaba heroin heroin 1 20019642 0.14
# 95 95 gaba heroin heroin 1 25704207 0.26
# 96 96 gaba heroin heroin 1 22191212 0.10
# 97 97 gaba heroin heroin 1 27920540 0.17
# 98 98 glu control control 1 46493165 0.17
# 99 99 glu control control 1 44525174 0.19
# 100 100 glu control control 1 37405883 0.30
# 101 101 glu control control 1 33724862 0.18
# 102 102 glu control control 1 35094142 0.21
# 103 103 glu control control 1 50605114 0.23
# 104 104 glu control control 1 33465855 0.16
# 105 105 glu control control 1 31493600 0.09
# 106 106 glu control control 1 34340810 0.13
# 107 107 glu control control 1 37350754 0.16
# 108 108 glu control control 1 35822337 0.19
# 109 109 glu control control 1 27296926 0.31
# 110 110 glu control control 1 25484620 0.15
# 111 111 glu control control 1 25647366 0.29
# 112 112 glu control control 1 25086894 0.25
# 113 113 glu control control 1 23904608 0.21
# 114 114 glu control control 1 25171322 0.20
# 115 115 glu control control 1 41608044 0.13
# 116 116 glu control control 1 24159262 0.18
# 117 117 glu control control 1 24962683 0.18
# 118 118 glu control control 1 23141216 0.24
# 119 119 glu control control 1 25038620 0.10
# 120 120 glu control control 1 24710482 0.20
# 121 121 glu control control 1 12117088 0.24
# 122 122 glu control control 1 26730796 0.26
# 123 123 glu control control 1 25835844 0.23
# 124 124 glu heroin heroin 1 34920064 0.19
# 125 125 glu heroin heroin 1 37199359 0.21
# 126 126 glu heroin heroin 1 40031386 0.10
# 127 127 glu heroin heroin 1 39698089 0.18
# 128 128 glu heroin heroin 1 43634936 0.17
# 129 129 glu heroin heroin 1 37132804 0.16
# 130 130 glu heroin heroin 1 52881648 0.14
# 131 131 glu heroin heroin 1 48183170 0.20
# 132 132 glu heroin heroin 1 47100392 0.17
# 133 133 glu heroin heroin 1 44330147 0.25
# 134 134 glu heroin heroin 1 32418334 0.20
# 135 135 glu heroin heroin 1 36583276 0.24
# 136 136 glu heroin heroin 1 39251762 0.13
# 137 137 glu heroin heroin 1 27322639 0.13
# 138 138 glu heroin heroin 1 24944148 0.13
# 139 139 glu heroin heroin 1 27341052 0.16
# 140 140 glu heroin heroin 1 25391272 0.17
# 141 141 glu heroin heroin 1 23019618 0.18
# 142 142 glu heroin heroin 1 22966732 0.29
# 143 143 glu heroin heroin 1 22320260 0.16
# 144 144 glu heroin heroin 1 21695518 0.15
# 145 145 glu heroin heroin 1 24645000 0.14
# 146 146 glu heroin heroin 1 24291834 0.14
# 147 147 glu heroin heroin 1 23011422 0.21
# 148 148 glu heroin heroin 1 28405218 0.21
# Count reads for every sample first: the heatmap and the count export below
# both need `df.counted`, and the only earlier assignment
# (`dba.count(df, summits=250)`) is commented out.
# Counting is done from the project root.
# NOTE(review): presumably the bamReads paths in the sample sheet are relative to
# the project root (the sheet built earlier uses "bwa.output/rmdups.bam/") -- confirm.
# NOTE(review): confirm that the default `dba.count(df)` counts, rather than the
# summits=250 run, are the ones wanted in the exported table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df.counted <- dba.count(df)
# Heatmap of the counted object, written into the hmmratac output directory
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.all.counted.heatmap.outliers.narrowpeak.pdf")
dba.plotHeatmap(df.counted)
dev.off()
# Export the count table as tab-separated text (read again by the UMAP section)
counts <- dba.peakset(df.counted, bRetrieve=TRUE, DataType=DBA_DATA_FRAME)
write.table(counts, "all.outliers.narrowpeak.peakset.counts.txt", sep="\t", quote=FALSE, row.names=FALSE)
# Print the per-sample summary (recorded in the comment table below)
df.counted
# 148 Samples, 150627 sites in matrix:
# ID Tissue Condition Treatment Replicate Reads FRiP
# 1 1 olig control control 1 35419836 0.19
# 2 2 olig control control 1 39985688 0.13
# 3 3 olig control control 1 18410024 0.23
# 4 4 olig control control 1 33169147 0.11
# 5 5 olig control control 1 35730552 0.17
# 6 6 olig control control 1 30796132 0.17
# 7 7 olig control control 1 12721318 0.17
# 8 8 olig control control 1 14676769 0.11
# 9 9 olig control control 1 43746962 0.06
# 10 10 olig control control 1 17095336 0.09
# 11 11 olig control control 1 12936951 0.15
# 12 12 olig control control 1 11352906 0.19
# 13 13 olig control control 1 15112980 0.19
# 14 14 olig control control 1 21742002 0.17
# 15 15 olig control control 1 26727988 0.10
# 16 16 olig control control 1 21531922 0.13
# 17 17 olig control control 1 22677852 0.23
# 18 18 olig control control 1 19943332 0.13
# 19 19 olig control control 1 24681970 0.10
# 20 20 olig control control 1 27769905 0.25
# 21 21 olig control control 1 26965074 0.14
# 22 22 olig control control 1 20045962 0.17
# 23 23 olig control control 1 21634187 0.15
# 24 24 olig control control 1 26725124 0.22
# 25 25 olig heroin heroin 1 17678860 0.18
# 26 26 olig heroin heroin 1 33878184 0.11
# 27 27 olig heroin heroin 1 34083352 0.08
# 28 28 olig heroin heroin 1 30492794 0.10
# 29 29 olig heroin heroin 1 18033990 0.13
# 30 30 olig heroin heroin 1 23300071 0.14
# 31 31 olig heroin heroin 1 22856649 0.19
# 32 32 olig heroin heroin 1 23347042 0.23
# 33 33 olig heroin heroin 1 17824007 0.19
# 34 34 olig heroin heroin 1 32371754 0.13
# 35 35 olig heroin heroin 1 37173477 0.12
# 36 36 olig heroin heroin 1 32197804 0.16
# 37 37 olig heroin heroin 1 38069042 0.08
# 38 38 olig heroin heroin 1 23155520 0.07
# 39 39 olig heroin heroin 1 24553068 0.16
# 40 40 olig heroin heroin 1 23520314 0.12
# 41 41 olig heroin heroin 1 26375644 0.14
# 42 42 olig heroin heroin 1 22820588 0.20
# 43 43 olig heroin heroin 1 27431814 0.18
# 44 44 olig heroin heroin 1 21521670 0.18
# 45 45 olig heroin heroin 1 22589463 0.13
# 46 46 olig heroin heroin 1 26240104 0.18
# 47 47 olig heroin heroin 1 27822724 0.12
# 48 48 gaba control control 1 42725768 0.11
# 49 49 gaba control control 1 43704543 0.13
# 50 50 gaba control control 1 46220300 0.10
# 51 51 gaba control control 1 42498778 0.09
# 52 52 gaba control control 1 33952735 0.23
# 53 53 gaba control control 1 43848567 0.10
# 54 54 gaba control control 1 37167868 0.14
# 55 55 gaba control control 1 34548832 0.08
# 56 56 gaba control control 1 43780400 0.06
# 57 57 gaba control control 1 11692172 0.13
# 58 58 gaba control control 1 36431902 0.19
# 59 59 gaba control control 1 45130302 0.16
# 60 60 gaba control control 1 23520169 0.23
# 61 61 gaba control control 1 25494066 0.11
# 62 62 gaba control control 1 24408560 0.20
# 63 63 gaba control control 1 36244265 0.24
# 64 64 gaba control control 1 21035405 0.19
# 65 65 gaba control control 1 28541170 0.21
# 66 66 gaba control control 1 27219006 0.12
# 67 67 gaba control control 1 24313938 0.16
# 68 68 gaba control control 1 26060403 0.18
# 69 69 gaba control control 1 22798424 0.14
# 70 70 gaba control control 1 25728756 0.11
# 71 71 gaba control control 1 18636863 0.13
# 72 72 gaba control control 1 21645282 0.25
# 73 73 gaba control control 1 27332056 0.17
# 74 74 gaba heroin heroin 1 38209635 0.16
# 75 75 gaba heroin heroin 1 39350404 0.13
# 76 76 gaba heroin heroin 1 43190608 0.07
# 77 77 gaba heroin heroin 1 38504060 0.07
# 78 78 gaba heroin heroin 1 43133304 0.10
# 79 79 gaba heroin heroin 1 31649714 0.13
# 80 80 gaba heroin heroin 1 41832122 0.17
# 81 81 gaba heroin heroin 1 36760146 0.15
# 82 82 gaba heroin heroin 1 17267758 0.19
# 83 83 gaba heroin heroin 1 39684486 0.18
# 84 84 gaba heroin heroin 1 16119282 0.13
# 85 85 gaba heroin heroin 1 13521178 0.11
# 86 86 gaba heroin heroin 1 21057856 0.11
# 87 87 gaba heroin heroin 1 23836164 0.12
# 88 88 gaba heroin heroin 1 23572849 0.14
# 89 89 gaba heroin heroin 1 22521928 0.15
# 90 90 gaba heroin heroin 1 22137926 0.17
# 91 91 gaba heroin heroin 1 29906670 0.20
# 92 92 gaba heroin heroin 1 25420382 0.09
# 93 93 gaba heroin heroin 1 21048088 0.13
# 94 94 gaba heroin heroin 1 20019642 0.12
# 95 95 gaba heroin heroin 1 25704207 0.24
# 96 96 gaba heroin heroin 1 22191212 0.09
# 97 97 gaba heroin heroin 1 27920540 0.16
# 98 98 glu control control 1 46493165 0.16
# 99 99 glu control control 1 44525174 0.17
# 100 100 glu control control 1 37405883 0.28
# 101 101 glu control control 1 33724862 0.17
# 102 102 glu control control 1 35094142 0.20
# 103 103 glu control control 1 50605114 0.21
# 104 104 glu control control 1 33465855 0.14
# 105 105 glu control control 1 31493600 0.08
# 106 106 glu control control 1 34340810 0.12
# 107 107 glu control control 1 37350754 0.15
# 108 108 glu control control 1 35822337 0.18
# 109 109 glu control control 1 27296926 0.29
# 110 110 glu control control 1 25484620 0.14
# 111 111 glu control control 1 25647366 0.27
# 112 112 glu control control 1 25086894 0.24
# 113 113 glu control control 1 23904608 0.19
# 114 114 glu control control 1 25171322 0.18
# 115 115 glu control control 1 41608044 0.12
# 116 116 glu control control 1 24159262 0.16
# 117 117 glu control control 1 24962683 0.17
# 118 118 glu control control 1 23141216 0.23
# 119 119 glu control control 1 25038620 0.09
# 120 120 glu control control 1 24710482 0.19
# 121 121 glu control control 1 12117088 0.23
# 122 122 glu control control 1 26730796 0.24
# 123 123 glu control control 1 25835844 0.22
# 124 124 glu heroin heroin 1 34920064 0.17
# 125 125 glu heroin heroin 1 37199359 0.19
# 126 126 glu heroin heroin 1 40031386 0.09
# 127 127 glu heroin heroin 1 39698089 0.16
# 128 128 glu heroin heroin 1 43634936 0.16
# 129 129 glu heroin heroin 1 37132804 0.15
# 130 130 glu heroin heroin 1 52881648 0.13
# 131 131 glu heroin heroin 1 48183170 0.18
# 132 132 glu heroin heroin 1 47100392 0.15
# 133 133 glu heroin heroin 1 44330147 0.24
# 134 134 glu heroin heroin 1 32418334 0.19
# 135 135 glu heroin heroin 1 36583276 0.22
# 136 136 glu heroin heroin 1 39251762 0.12
# 137 137 glu heroin heroin 1 27322639 0.12
# 138 138 glu heroin heroin 1 24944148 0.12
# 139 139 glu heroin heroin 1 27341052 0.15
# 140 140 glu heroin heroin 1 25391272 0.16
# 141 141 glu heroin heroin 1 23019618 0.17
# 142 142 glu heroin heroin 1 22966732 0.27
# 143 143 glu heroin heroin 1 22320260 0.15
# 144 144 glu heroin heroin 1 21695518 0.14
# 145 145 glu heroin heroin 1 24645000 0.12
# 146 146 glu heroin heroin 1 24291834 0.13
# 147 147 glu heroin heroin 1 23011422 0.19
# 148 148 glu heroin heroin 1 28405218 0.20
### Differential - Condition + Tissue
# Set up the contrasts on the Condition and Tissue attributes
df.counted <- dba.contrast(df.counted, categories=c(DBA_CONDITION,DBA_TISSUE))
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# Run both methods (DESeq2 and edgeR) in a single analysis.
# Single-method alternatives, kept for reference. Each one starts again from
# `df.counted` and its result would be replaced by the call below, so running
# them first only adds compute time:
#   df.analysed <- dba.analyze(df.counted)                    # DESeq2
#   df.analysed <- dba.analyze(df.counted, method=DBA_EDGER)  # EdgeR
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
# List the contrasts (results recorded in the comment table below)
dba.show(df.analysed, bContrasts=TRUE)
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Tissue olig 47 gaba 50 115395 115738
# 2 Tissue olig 47 glu 51 129348 129566
# 3 Tissue gaba 50 glu 51 95684 96252
# 4 Treatment heroin 72 control 76 4954 5611
# Factor Group Samples Group2 Samples2 DB.edgeR DB.DESeq2
# 1 Condition control 76 heroin 72 5499 6148
# 2 Tissue olig 47 gaba 50 119257 119630
# 3 Tissue olig 47 glu 51 133551 133765
# 4 Tissue glu 51 gaba 50 98110 98758
# Export the analysed peak set, the DiffBind diagnostic plots and the report.
# Every file below is written into the hmmratac output directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")

# Peak set of the analysed object as a tab-separated table
counts <- dba.peakset(df.analysed, bRetrieve = TRUE, DataType = DBA_DATA_FRAME)
write.table(counts, "all.outliers.peakset.narrowpeak.differential.condition.tissue.counts.txt", sep = "\t", quote = FALSE, row.names = FALSE)

# Venn diagram for contrast 1 across all analysis methods
pdf("dba.hmmratac.all-cell.differential.venn.outliers.narrowpeak.condition.tissue.pdf")
dba.plotVenn(df.analysed, contrast = 1, method = DBA_ALL_METHODS)
dev.off()

# Heatmap for contrast 1
pdf("dba.hmmratac.all-cell.differential.heatmap.outliers.narrowpeak.condition.tissue.pdf")
dba.plotHeatmap(df.analysed, contrast = 1)
dev.off()

# Second heatmap for contrast 1: correlations switched off, columns annotated by Condition
pdf("dba.hmmratac.all-cell.differential.heatmap2.outliers.narrowpeak.condition.pdf")
dba.plotHeatmap(df.analysed, ColAttributes = DBA_CONDITION, contrast = 1, correlations = FALSE)
dev.off()

# MA plot
pdf("dba.hmmratac.all-cell.plotMA.outliers.narrowpeak.condition.tissue.pdf")
dba.plotMA(df.analysed)
dev.off()

# Volcano plot
pdf("dba.hmmratac.all-cell.plotVolcano.outliers.narrowpeak.condition.tissue.pdf")
dba.plotVolcano(df.analysed)
dev.off()

# PCA plot for contrast 1
pdf("dba.hmmratac.all-cell.plotPCA.outliers.narrowpeak.condition.tissue.pdf")
dba.plotPCA(df.analysed, contrast = 1)
dev.off()

# Box plot
pdf("dba.hmmratac.all-cell.plotBox.outliers.narrowpeak.condition.tissue.pdf")
dba.plotBox(df.analysed)
dev.off()

# Report of the analysis; note the output is tab-separated despite the .csv extension
report <- dba.report(df.analysed)
report
report.df <- as.data.frame(report)
write.table(report.df, "peaks.report.hmmratac.all.narrowpeak.outliers.condition.tissue.csv", sep = "\t", quote = FALSE, row.names = FALSE)
###### August 2021
###### consensus peak calling
# Save the peak table of the first sample (written to the current working directory)
write.table(df$peaks[[1]], "diffbind.peaks.txt", quote = FALSE, row.names = FALSE, sep = "\t")
# Add one consensus peakset per Tissue x Condition group, with minOverlap = 0.66.
# NOTE: despite its name, `df.counted` now holds this peakset object and no
# longer the counted object assigned to it earlier.
df.counted <- dba.peakset(df, consensus = c(DBA_TISSUE, DBA_CONDITION), minOverlap = 0.66)
# Print the summary (recorded in the comment table below)
df.counted
# 154 Samples, 164756 sites in matrix (264282 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 1 olig control control 1 29032
# 2 2 olig control control 1 26109
# 3 3 olig control control 1 21283
# 4 4 olig control control 1 17434
# 5 5 olig control control 1 26008
# 6 6 olig control control 1 26520
# 7 7 olig control control 1 17306
# 8 8 olig control control 1 6443
# 9 9 olig control control 1 8157
# 10 10 olig control control 1 6417
# 11 11 olig control control 1 9147
# 12 12 olig control control 1 13954
# 13 13 olig control control 1 18657
# 14 14 olig control control 1 18421
# 15 15 olig control control 1 16130
# 16 16 olig control control 1 13823
# 17 17 olig control control 1 23046
# 18 18 olig control control 1 14478
# 19 19 olig control control 1 15281
# 20 20 olig control control 1 43433
# 21 21 olig control control 1 24317
# 22 22 olig control control 1 21075
# 23 23 olig control control 1 18027
# 24 24 olig control control 1 29685
# 25 25 olig heroin heroin 1 22145
# 26 26 olig heroin heroin 1 17825
# 27 27 olig heroin heroin 1 11362
# 28 28 olig heroin heroin 1 16823
# 29 29 olig heroin heroin 1 11474
# 30 30 olig heroin heroin 1 18099
# 31 31 olig heroin heroin 1 22819
# 32 32 olig heroin heroin 1 21158
# 33 33 olig heroin heroin 1 23807
# 34 34 olig heroin heroin 1 21414
# 35 35 olig heroin heroin 1 27465
# 36 36 olig heroin heroin 1 32288
# 37 37 olig heroin heroin 1 18006
# 38 38 olig heroin heroin 1 7793
# 39 39 olig heroin heroin 1 27147
# 40 40 olig heroin heroin 1 16194
# 41 41 olig heroin heroin 1 24684
# 42 42 olig heroin heroin 1 26964
# 43 43 olig heroin heroin 1 29342
# 44 44 olig heroin heroin 1 18771
# 45 45 olig heroin heroin 1 18824
# 46 46 olig heroin heroin 1 31472
# 47 47 olig heroin heroin 1 20327
# 48 48 gaba control control 1 24545
# 49 49 gaba control control 1 32246
# 50 50 gaba control control 1 23614
# 51 51 gaba control control 1 19457
# 52 52 gaba control control 1 42701
# 53 53 gaba control control 1 21587
# 54 54 gaba control control 1 24086
# 55 55 gaba control control 1 9076
# 56 56 gaba control control 1 5998
# 57 57 gaba control control 1 5423
# 58 58 gaba control control 1 34138
# 59 59 gaba control control 1 37469
# 60 60 gaba control control 1 33974
# 61 61 gaba control control 1 16549
# 62 62 gaba control control 1 22743
# 63 63 gaba control control 1 48294
# 64 64 gaba control control 1 23760
# 65 65 gaba control control 1 28689
# 66 66 gaba control control 1 16268
# 67 67 gaba control control 1 24659
# 68 68 gaba control control 1 21780
# 69 69 gaba control control 1 19174
# 70 70 gaba control control 1 14876
# 71 71 gaba control control 1 14740
# 72 72 gaba control control 1 33002
# 73 73 gaba control control 1 30544
# 74 74 gaba heroin heroin 1 35624
# 75 75 gaba heroin heroin 1 32727
# 76 76 gaba heroin heroin 1 6808
# 77 77 gaba heroin heroin 1 8554
# 78 78 gaba heroin heroin 1 22055
# 79 79 gaba heroin heroin 1 19964
# 80 80 gaba heroin heroin 1 36778
# 81 81 gaba heroin heroin 1 32745
# 82 82 gaba heroin heroin 1 17300
# 83 83 gaba heroin heroin 1 41946
# 84 84 gaba heroin heroin 1 13213
# 85 85 gaba heroin heroin 1 3831
# 86 86 gaba heroin heroin 1 11000
# 87 87 gaba heroin heroin 1 15277
# 88 88 gaba heroin heroin 1 21535
# 89 89 gaba heroin heroin 1 24438
# 90 90 gaba heroin heroin 1 24630
# 91 91 gaba heroin heroin 1 35503
# 92 92 gaba heroin heroin 1 9818
# 93 93 gaba heroin heroin 1 10632
# 94 94 gaba heroin heroin 1 11842
# 95 95 gaba heroin heroin 1 28456
# 96 96 gaba heroin heroin 1 8986
# 97 97 gaba heroin heroin 1 22478
# 98 98 glu control control 1 47068
# 99 99 glu control control 1 53067
# 100 100 glu control control 1 74486
# 101 101 glu control control 1 34854
# 102 102 glu control control 1 37625
# 103 103 glu control control 1 64025
# 104 104 glu control control 1 28061
# 105 105 glu control control 1 10506
# 106 106 glu control control 1 32681
# 107 107 glu control control 1 36195
# 108 108 glu control control 1 47090
# 109 109 glu control control 1 60838
# 110 110 glu control control 1 22602
# 111 111 glu control control 1 50575
# 112 112 glu control control 1 47815
# 113 113 glu control control 1 35105
# 114 114 glu control control 1 32028
# 115 115 glu control control 1 30466
# 116 116 glu control control 1 23867
# 117 117 glu control control 1 29987
# 118 118 glu control control 1 33924
# 119 119 glu control control 1 7979
# 120 120 glu control control 1 36745
# 121 121 glu control control 1 136
# 122 122 glu control control 1 46002
# 123 123 glu control control 1 40632
# 124 124 glu heroin heroin 1 41546
# 125 125 glu heroin heroin 1 56011
# 126 126 glu heroin heroin 1 16980
# 127 127 glu heroin heroin 1 42687
# 128 128 glu heroin heroin 1 50208
# 129 129 glu heroin heroin 1 36072
# 130 130 glu heroin heroin 1 54434
# 131 131 glu heroin heroin 1 56322
# 132 132 glu heroin heroin 1 56311
# 133 133 glu heroin heroin 1 65811
# 134 134 glu heroin heroin 1 38417
# 135 135 glu heroin heroin 1 58009
# 136 136 glu heroin heroin 1 24746
# 137 137 glu heroin heroin 1 16008
# 138 138 glu heroin heroin 1 13214
# 139 139 glu heroin heroin 1 24305
# 140 140 glu heroin heroin 1 34243
# 141 141 glu heroin heroin 1 26657
# 142 142 glu heroin heroin 1 42541
# 143 143 glu heroin heroin 1 17302
# 144 144 glu heroin heroin 1 12472
# 145 145 glu heroin heroin 1 15040
# 146 146 glu heroin heroin 1 17516
# 147 147 glu heroin heroin 1 29186
# 148 148 glu heroin heroin 1 32651
# 149 olig:control olig control control 1 11772
# 150 olig:heroin olig heroin heroin 1 13148
# 151 gaba:control gaba control control 1 14197
# 152 gaba:heroin gaba heroin heroin 1 10687
# 153 glu:control glu control control 1 19460
# 154 glu:heroin glu heroin heroin 1 16278
# Write each consensus peakset to its own tab-separated file in the current
# working directory. Entries 149-154 are the consensus peaksets listed after the
# 148 samples in the table above; the trailing numbers are their interval counts.
consensus_files <- c(
  "154" = "diffbind.consensus.glu.heroin.txt",    # 16278
  "153" = "diffbind.consensus.glu.control.txt",   # 19460
  "152" = "diffbind.consensus.gaba.heroin.txt",   # 10687
  "151" = "diffbind.consensus.gaba.control.txt",  # 14197
  "150" = "diffbind.consensus.olig.heroin.txt",   # 13148
  "149" = "diffbind.consensus.olig.control.txt"   # 11772
)
for (peakset_index in names(consensus_files)) {
  write.table(df.counted$peaks[[as.integer(peakset_index)]], consensus_files[[peakset_index]],
              quote = FALSE, row.names = FALSE, sep = "\t")
}
# Build an object restricted to the consensus peaksets (summary recorded below)
df_consensus <- dba(df.counted, mask = df.counted$masks$Consensus)
# 6 Samples, 20838 sites in matrix (28813 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig:control olig control control 1 11772
# 2 olig:heroin olig heroin heroin 1 13148
# 3 gaba:control gaba control control 1 14197
# 4 gaba:heroin gaba heroin heroin 1 10687
# 5 glu:control glu control control 1 19460
# 6 glu:heroin glu heroin heroin 1 16278
# Consensus peaksets per Condition (all tissues pooled), minOverlap = 0.66
df.condition <- dba.peakset(df, consensus = DBA_CONDITION, minOverlap = 0.66)
df.condition.consensus <- dba(df.condition, mask = df.condition$masks$Consensus)
# 2 Samples, 8694 sites in matrix (10403 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 control olig-gaba-glu control control 1 9491
# 2 heroin olig-gaba-glu heroin heroin 1 9606

# Consensus peaksets per Tissue (both conditions pooled), minOverlap = 0.66
df.cell <- dba.peakset(df, consensus = DBA_TISSUE, minOverlap = 0.66)
df.cell.consensus <- dba(df.cell, mask = df.cell$masks$Consensus)
# 3 Samples, 12026 sites in matrix (24513 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig olig control-heroin control-heroin 1 12168
# 2 gaba gaba control-heroin control-heroin 1 12830
# 3 glu glu control-heroin control-heroin 1 18400

# Venn diagrams of the consensus peaksets, written to the current working directory
pdf("consensus.condition.overlap.pdf")
dba.plotVenn(df.condition, df.condition$masks$Consensus)
dev.off()

pdf("consensus.cell.overlap.pdf")
dba.plotVenn(df.cell, df.cell$masks$Consensus)
dev.off()
#### differential consensus peaks
library(tidygenomics)
# Compare the heroin and control consensus peaks within each cell type, then look
# for condition-specific peaks shared by all three cell types.
# NOTE(review): the consensus tables are read from the project root here, while
# the write.table() calls that create them write to whatever directory was
# current at that point -- confirm the files are in this directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac")

# Read diffbind.consensus.<tag>.txt and add a running peak index in a column
# named <tag>.peak. seq_len() is used so an empty table yields an empty index
# (seq.int(0) would give c(1, 0) and make the assignment fail).
read_consensus <- function(tag) {
  peaks <- read.delim(paste0("diffbind.consensus.", tag, ".txt"), header = TRUE, sep = "\t")
  peaks[[paste0(tag, ".peak")]] <- seq_len(nrow(peaks))
  peaks
}

# Rows of `peaks` whose index (column `id_col`) does not occur in `shared`
peaks_not_in <- function(peaks, shared, id_col) {
  subset(peaks, !(peaks[[id_col]] %in% shared[[id_col]]))
}

glu.heroin <- read_consensus("glu.heroin")
glu.control <- read_consensus("glu.control")
gaba.heroin <- read_consensus("gaba.heroin")
gaba.control <- read_consensus("gaba.control")
olig.heroin <- read_consensus("olig.heroin")
olig.control <- read_consensus("olig.control")

coord_cols <- c("Chr", "Start", "End")

glu <- genome_intersect(glu.heroin, glu.control, by = coord_cols)
glu.heroin.only <- peaks_not_in(glu.heroin, glu, "glu.heroin.peak")
glu.control.only <- peaks_not_in(glu.control, glu, "glu.control.peak")
# glu intersect = 14636, heroin only = 1642, control only = 4824

gaba <- genome_intersect(gaba.heroin, gaba.control, by = coord_cols)
gaba.heroin.only <- peaks_not_in(gaba.heroin, gaba, "gaba.heroin.peak")
gaba.control.only <- peaks_not_in(gaba.control, gaba, "gaba.control.peak")
# gaba intersect = 10144, heroin only = 543, control only = 4053

olig <- genome_intersect(olig.heroin, olig.control, by = coord_cols)
olig.heroin.only <- peaks_not_in(olig.heroin, olig, "olig.heroin.peak")
olig.control.only <- peaks_not_in(olig.control, olig, "olig.control.peak")
# olig intersect = 10762, heroin only = 2386, control only = 1010

# Heroin-only peaks found in all three cell types
glu.gaba.heroin <- genome_intersect(glu.heroin.only, gaba.heroin.only, by = coord_cols)
glu.gaba.olig.heroin <- genome_intersect(glu.gaba.heroin, olig.heroin.only, by = coord_cols)
# 9

# Control-only peaks found in all three cell types
glu.gaba.control <- genome_intersect(glu.control.only, gaba.control.only, by = coord_cols)
glu.gaba.olig.control <- genome_intersect(glu.gaba.control, olig.control.only, by = coord_cols)
# 40
# Count reads for the full sample set, then set up the contrasts
df.count <- dba.count(df)
df.contrast <- dba.contrast(df.count, categories = c(DBA_CONDITION, DBA_TISSUE))
df.condition.contrast <- dba.contrast(df.condition, categories = DBA_CONDITION)
# NOTE(review): this reuses the name `df.contrast`, replacing the Condition + Tissue
# contrast object assigned two statements earlier -- confirm that is intended.
df.contrast <- dba.contrast(df.cell, categories = DBA_TISSUE)
# Condition contrast with the glu sample mask passed as the blocking factor
df.contrast.glu <- dba.contrast(df.count, categories = DBA_CONDITION, block = df.count$masks$glu)
# When the analysis is run, it uses both the single-factor comparison and a
# linear model fitted with the second (blocking) factor, for comparison:
df.contrast.glu.analyze <- dba.analyze(df.contrast.glu)
# Interactive allocation used for the DESeq2/UMAP work below:
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Load the site python module, initialise conda, and activate the analysis env.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Start an interactive R session; the R code that follows is run inside it.
R
# UMAP of all samples on the narrowPeak peak-set count matrix.
library(DESeq2)
library(umap)
library(ggplot2)
library(magrittr)
library(dplyr)
# Set the seed so our results are reproducible:
set.seed(12345)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.summit.sample.outlier.csv")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.narrowpeak.peakset.counts.txt", header=TRUE, sep="\t")
# Columns 4-151 are used as the per-sample counts.
# NOTE(review): this assumes the columns are in the same order as the rows of
# `samples`; nothing below checks sample names -- confirm against the key file.
counts.df <- counts[,4:151]
# DESeq2 expects integer counts
counts.mat <- round(counts.df)
dds <- DESeqDataSetFromMatrix(
countData = counts.mat, # the counts values for all samples in our dataset
colData = samples, # annotation data for the samples in the counts data frame
design = ~ Condition + Tissue
)
# Normalize and transform the data in the `DESeqDataSet` object
# using the `vst()` function from the `DESeq2` R package
dds_norm <- vst(dds)
# Retrieve the normalized data with `assay()` and transpose it so that
# each row is a sample
normalized_counts <- t(assay(dds_norm))
# Now perform UMAP on the normalized data
umap_results <- umap::umap(normalized_counts)
# Make into data frame for plotting with `ggplot2`
# The UMAP values we need for plotting are stored in the `layout` element
umap_plot_df <- data.frame(umap_results$layout) %>%
# Attach the sample IDs (taken positionally from the sample sheet)
# tibble::rownames_to_column("Sample.ID") %>%
mutate(Sample.ID = samples$Sample.ID) %>%
# Add the metadata into this data frame; match by sample IDs
inner_join(samples, by = "Sample.ID")
# The working directory is already bwa.output/hmmratac, so the PDF lands there.
pdf("dba.hmmratac.all.umap.outliers.narrowpeak.pdf")
# Explicit print(): a ggplot object is only drawn automatically when it is
# auto-printed at the interactive top level; under source() the PDF would be empty.
print(ggplot(umap_plot_df, aes(x = X1, y = X2, color = Condition, shape = Tissue)) + geom_point(size=3) + theme_classic())
dev.off()
#### differential
# UMAP restricted to the differential (condition/tissue) peak set.
# Assumes DESeq2, umap, ggplot2, magrittr and dplyr are already attached.
# Re-seed here so this plot is reproducible on its own and does not depend on
# how much of the RNG stream earlier code consumed.
set.seed(12345)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.summit.sample.outlier.csv")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.peakset.narrowpeak.differential.condition.tissue.counts.txt", header=TRUE, sep="\t")
# Columns 4-151 are used as the per-sample counts.
# NOTE(review): this assumes the columns are in the same order as the rows of
# `samples`; nothing below checks sample names -- confirm against the key file.
counts.df <- counts[,4:151]
# DESeq2 expects integer counts
counts.mat <- round(counts.df)
dds <- DESeqDataSetFromMatrix(
countData = counts.mat, # the counts values for all samples in our dataset
colData = samples, # annotation data for the samples in the counts data frame
design = ~ Condition + Tissue
)
# Normalize and transform the data in the `DESeqDataSet` object
# using the `vst()` function from the `DESeq2` R package
dds_norm <- vst(dds)
# Retrieve the normalized data with `assay()` and transpose it so that
# each row is a sample
normalized_counts <- t(assay(dds_norm))
# Now perform UMAP on the normalized data
umap_results <- umap::umap(normalized_counts)
# Make into data frame for plotting with `ggplot2`
# The UMAP values we need for plotting are stored in the `layout` element
umap_plot_df <- data.frame(umap_results$layout) %>%
# Attach the sample IDs (taken positionally from the sample sheet)
# tibble::rownames_to_column("Sample.ID") %>%
mutate(Sample.ID = samples$Sample.ID) %>%
# Add the metadata into this data frame; match by sample IDs
inner_join(samples, by = "Sample.ID")
# The working directory is already bwa.output/hmmratac, so the PDF lands there.
pdf("dba.hmmratac.all.umap.outliers.narrowpeak.differential.pdf")
# Explicit print(): a ggplot object is only drawn automatically when it is
# auto-printed at the interactive top level; under source() the PDF would be empty.
print(ggplot(umap_plot_df, aes(x = X1, y = X2, color = Condition, shape = Tissue)) + geom_point(size=3) + theme_classic())
dev.off()
# seq2science getting-started guide: https://vanheeringen-lab.github.io/seq2science/content/gettingstarted.html
# Set up and run the seq2science ATAC-seq workflow.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# One-time environment creation (kept for reference):
# conda config --add channels defaults
# conda config --add channels bioconda
# conda config --add channels conda-forge
# conda create -n seq2science seq2science
conda activate seq2science
### /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/seq2science
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
# -p: do not fail when these notes are re-run and the directory already exists
mkdir -p seq2science
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/seq2science
# Write the default config.yaml / samples.tsv templates, then do a first run
seq2science init atac-seq
seq2science run atac-seq --cores 10
# Strip the "trimmed_" prefix from the fastq file names (replace it with nothing):
# find . -name 'trimmed_*' | while read f; do mv "$f" "${f/trimmed_/}"; done
### alter config.yaml to use local directories
# genome_dir: /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref
# fastq_dir: /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/trimmed
# technical_replicates: keep
# biological_replicate: keep
# contrasts:
# - 'condition_control_heroin'
# - 'replicate_control.olig_heroin.olig'
# - 'replicate_control.gaba_heroin.gaba'
# - 'replicate_control.glu_heroin.glu'
### make sample.tsv file with (sample, assembly, technical_replicate, descriptive_name, control)
# Re-run once config.yaml and the sample sheet are in place
seq2science run atac-seq --cores 10
# Build the seq2science sample sheet (samplesR1.tsv) from the ATAC metadata key.
library(dplyr)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
# Two-column, header-less, tab-delimited key: sample name and replicate label
df <- read.delim("atac.metadata.txt", header=FALSE, sep="\t")
names(df) <- c("sample", "replicate")
df[["assembly"]] <- "GCF_000001405.39_GRCh38.p13"
# Copy of the sample name that gets split at "_" into descriptive_name / seq
df[["name"]] <- df[["sample"]]
df2 <- df %>% separate(name, c("descriptive_name", "seq"), sep="_")
# The control column holds the literal string "NA"
df2[["control"]] <- "NA"
# Copy of the replicate label that gets split into condition / tissue
df2[["id"]] <- df2[["replicate"]]
df3 <- df2 %>% separate(id, c("condition", "tissue"))
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/seq2science")
write.table(df3, file='samplesR1.tsv', quote=FALSE, row.names=FALSE, sep="\t")
# Afterwards, in the shell, strip the R1 tag to get the final sample sheet:
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/seq2science
# sed 's/.R1//g' samplesR1.tsv > samples.tsv
# H3K27Ac ChIP-seq data: /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/H3K27Ac_ChIPseq
# Data aligned to hg38 (utilizing different chr names)...
## To view on IGV with our peak samples... adjust chromosome names...
# Local copy: rename UCSC-style chromosome names (chr1..chr22, chrX, chrY) to
# the RefSeq accessions used by the GRCh38.p13 reference so the peaks line up
# with our own tracks in IGV.
# The Dropbox path contains a space and parentheses, so it has to be quoted.
cd "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Chipseq"

# chrfix FILE.broadPeak -> writes FILE.chrfix.broadPeak next to the input.
# Only an exact match of column 1 is rewritten, so other sequence names
# (e.g. chr1_..._random, chrUn_..., chrM) and all other columns pass through
# unchanged instead of being partially substituted.
chrfix() {
    awk '
        BEGIN {
            FS = OFS = "\t"
            n = split("NC_000001.11 NC_000002.12 NC_000003.12 NC_000004.12 NC_000005.10 NC_000006.12 NC_000007.14 NC_000008.11 NC_000009.12 NC_000010.11 NC_000011.10 NC_000012.12 NC_000013.11 NC_000014.9 NC_000015.10 NC_000016.10 NC_000017.11 NC_000018.10 NC_000019.10 NC_000020.11 NC_000021.9 NC_000022.11", acc, " ")
            for (i = 1; i <= n; i++) refseq["chr" i] = acc[i]
            refseq["chrX"] = "NC_000023.11"
            refseq["chrY"] = "NC_000024.10"
        }
        $1 in refseq { $1 = refseq[$1] }
        { print }
    ' "$1" > "${1%.broadPeak}.chrfix.broadPeak"
}

chrfix H.276.OLIG_peaks.broadPeak
# 68563 H.276.OLIG_peaks.chrfix.broadPeak
chrfix H.372.GLU_peaks.broadPeak
# 94325 H.372.GLU_peaks.chrfix.broadPeak
# Cluster copy: rename UCSC-style chromosome names (chr1..chr22, chrX, chrY) to
# the RefSeq accessions of the GRCh38.p13 reference for every Mt. Sinai
# H3K27Ac broadPeak file (GABA/SOX, GLU and OLIG).

# chrfix FILE.broadPeak -> writes FILE.chrfix.broadPeak next to the input.
# Only an exact match of column 1 is rewritten, so other sequence names
# (e.g. chr1_..._random, chrUn_..., chrM) and all other columns pass through
# unchanged instead of being partially substituted.
chrfix() {
    awk '
        BEGIN {
            FS = OFS = "\t"
            n = split("NC_000001.11 NC_000002.12 NC_000003.12 NC_000004.12 NC_000005.10 NC_000006.12 NC_000007.14 NC_000008.11 NC_000009.12 NC_000010.11 NC_000011.10 NC_000012.12 NC_000013.11 NC_000014.9 NC_000015.10 NC_000016.10 NC_000017.11 NC_000018.10 NC_000019.10 NC_000020.11 NC_000021.9 NC_000022.11", acc, " ")
            for (i = 1; i <= n; i++) refseq["chr" i] = acc[i]
            refseq["chrX"] = "NC_000023.11"
            refseq["chrY"] = "NC_000024.10"
        }
        $1 in refseq { $1 = refseq[$1] }
        { print }
    ' "$1" > "${1%.broadPeak}.chrfix.broadPeak"
}

peak_root=/gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/H3K27Ac_ChIPseq

# GABA samples (files are labelled SOX)
cd "$peak_root/GABA/Mt_Sinai_Peaks"
for id in 276 286 344 372 395 406 412 427 444; do
    chrfix "H.${id}.SOX_peaks.broadPeak"
done

# GLU samples
cd "$peak_root/GLU/Mt_Sinai_Peaks"
for id in 276 286 344 372 395 406 412 427 444; do
    chrfix "H.${id}.GLU_peaks.broadPeak"
done

# OLIG samples (only three were processed)
cd "$peak_root/OLIG/Mt_Sinai_Peaks"
for id in 276 372 395; do
    chrfix "H.${id}.OLIG_peaks.broadPeak"
done
# Merge the chromosome-renamed H3K27Ac peaks of each cell type with BEDOPS.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# One-time environment creation
conda create --name bedops bedops=2.4.39 python=3.8 -c bioconda -c conda-forge
conda activate bedops

peak_root=/gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/H3K27Ac_ChIPseq

for celltype in GLU OLIG GABA; do
    cd "$peak_root/$celltype/Mt_Sinai_Peaks"
    # One 4-column BED per sample. Redirecting to a glob ("> *.bed") does not
    # produce one file per input: it creates a single file literally named
    # "*.bed" or fails with "ambiguous redirect".
    # BEDOPS tools require sort-bed ordering, which the chromosome renaming
    # does not guarantee, so sort each file explicitly.
    # NOTE(review): fields 1-3,5 put broadPeak column 5 into the BED ID
    # column, so --echo-map-id below reports those values -- confirm column 5
    # (rather than the peak name in column 4) is what is wanted.
    for peaks in *.chrfix.broadPeak; do
        cut -f 1-3,5 "$peaks" | sort-bed - > "${peaks%.broadPeak}.bed"
    done
    # Glob only the per-sample files so union.bed / merge.bed from this or an
    # earlier run are never fed back in as inputs.
    bedops -u *.chrfix.bed > union.bed
    bedops -m *.chrfix.bed > merge.bed
    # For every merged interval, list the IDs of the union elements it covers
    prefix=$(printf '%s' "$celltype" | tr '[:upper:]' '[:lower:]')
    bedmap --echo --echo-map-id --delim '\t' merge.bed union.bed > "${prefix}.merged.peaks.bed"
done
# Hi-C references:
# https://academic.hep.com.cn/qb/article/2017/2095-4689/2095-4689-5-3-215.shtml
# https://qcb.ucla.edu/wp-content/uploads/sites/14/2017/02/Workshop-10-HiC-D1.pdf
# Hi-C data: /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/HiC
# --> too much time and space
# 25 paired-end samples (50 files), each 25-50G in compressed state
# walltime --> Zhikai: 2G data = 30 hrs processing
# disk space --> bowtie2 processes R1 and R2 independently and then merges the alignments at the pairing step, therefore creating 8 intermediate BAM files for every paired-end sample
# https://nservant.github.io/HiC-Pro/FAQ.html#why-hic-pro-need-to-be-run-in-two-steps-in-parallel-mode
# Install HiC-Pro: create a conda env, clone the HiC-Pro sources into it,
# install the dependencies with conda, then build with make.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/
conda create --name HiCPro python=3.8
conda activate HiCPro
# NOTE(review): a conda hicpro package is installed here and HiC-Pro is also
# built from the git clone below -- confirm which of the two is needed.
conda install -c davebx hicpro
# NOTE(review): repeated activation of the same env; looks redundant.
conda activate HiCPro
# Clone the HiC-Pro sources inside the env directory
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/
git clone https://github.com/nservant/HiC-Pro.git
cd HiC-Pro
# Dependencies installed into the env before building
conda install -c bioconda bx-python
conda install -c conda-forge scipy
conda install -c bioconda pysam
conda install -c bioconda iced
conda install -c bioconda bowtie2 #version 2.4.4
conda install -c bioconda samtools
conda install -c conda-forge wget
conda install -c r r
# Build and install from the cloned source tree
make configure
make install
# HiC-Pro --help
# usage : HiC-Pro -i INPUT -o OUTPUT -c CONFIG [-s ANALYSIS_STEP] [-p] [-h] [-v]
# Put all input files in a rawdata folder. The input files have to be organized with one folder per sample
# Shared locations for the commands below (the bare path notes are folded
# into these variables so they are no longer executed as commands).
hicpro_home=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro
# rawdata folder, one sub-folder per sample
hic_input=/gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/HiC/hicpro
hic_output=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro
# local HiC-Pro configuration file
hic_config="$hicpro_home/config-hicpro.txt"
# Generate bowtie2 indices
# -p: do not fail when the directories already exist
mkdir -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/bowtie2
mkdir -p "$hic_output"
#conda install -c bioconda bowtie2
# NOTE(review): the index prefix is relative, so the index files are written
# to the current directory, not to ref/bowtie2 created above -- confirm it
# matches the bowtie2 index path set in the HiC-Pro config file.
bowtie2-build /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.fna GRCh38.p13.bt2
# Adjust paths in local configuration file ($hic_config)
# Run HiC Pro:
# MY_INSTALL_PATH/bin/HiC-Pro -i FULL_PATH_TO_DATA_FOLDER -o FULL_PATH_TO_OUTPUTS -c MY_LOCAL_CONFIG_FILE -p
# Please run HiC-Pro in two steps :
# 1- The following command will launch the parallel workflow through 12 torque jobs:
# qsub HiCPro_step1.sh
# 2- The second command will merge all outputs to generate the contact maps:
# qsub HiCPro_step2.sh
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro
cd "$hicpro_home/bin"
# alter config-system.txt
# Parallel mode (-p)
"$hicpro_home/bin/HiC-Pro" -i "$hic_input" -o "$hic_output" -c "$hic_config" -p
# run on local computer
#MY_INSTALL_PATH/bin/HiC-Pro -i FULL_PATH_TO_DATA_FOLDER -o FULL_PATH_TO_OUTPUTS -c MY_LOCAL_CONFIG_FILE
"$hicpro_home/bin/HiC-Pro" -i "$hic_input" -o "$hic_output" -c "$hic_config"
#time HICPRO_INSTALL_DIR/bin/HiC-Pro -c config_test_latest.txt -i test_data -o hicpro_latest_test
# Same run, timed
time "$hicpro_home/bin/HiC-Pro" -i "$hic_input" -o "$hic_output" -c "$hic_config"
# Run HiC-Pro in parallel (-p) mode on the cluster and submit the two
# generated job scripts to Slurm.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
# make new system config file for directing to cluster
## vim /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/scripts/make_torque_script_slurm.sh ### adjust nodes in SBATCH submission parameters
## vim /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/scripts/config-system.txt
## cp /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.chr.size /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/annotation/.
## adjust slurm commands: /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/scripts/make_torque_script_slurm.sh
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
# -p only writes the HiCPro_step1/step2 job scripts into the output folder
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/HiC/hicpro -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
## adjust working directory in scripts from PBS to SLURM (submitdir="${SLURM_SUBMIT_DIR}") --> vim HiCPro_step1_hicpro.sh
# Step 2 merges the outputs of step 1, so it must not start until every
# step-1 task has finished successfully: capture the step-1 job id and make
# step 2 depend on it.
step1_job=$(sbatch --parsable HiCPro_step1_hicpro.sh)
# --parsable prints "jobid" or "jobid;cluster"; keep only the job id
sbatch --dependency=afterok:"${step1_job%%;*}" HiCPro_step2_hicpro.sh
# try on sub-sampled fastq file
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
#git clone https://github.com/lh3/seqtk.git;
#cd seqtk; make
#mkdir /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/HiC/subsample
#mkdir /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/HiC/subsample/gaba372
seqtk=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/seqtk/seqtk
hic_data=/gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/HiC
hicpro_home=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro
hic_output=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro
sample=GABA372_AGTCAA_HWJW2DSXY_L001_001

cd "$hic_output/"
# Sample a 0.1 fraction of each mate; the same seed (-s100) is used for R1 and R2
for mate in R1 R2; do
    "$seqtk" sample -s100 "$hic_data/hicpro/gaba372/${sample}.${mate}.fastq" 0.1 > "$hic_data/subsample/gaba372/${sample}_sub.${mate}.fastq"
done

# Run HiC-Pro in parallel (-p) mode with the subsample folder as input
cd "$hicpro_home/bin"
"$hicpro_home/bin/HiC-Pro" -i "$hic_data/subsample" -o "$hic_output" -c "$hicpro_home/config-hicpro.txt" -p
cd "$hic_output/"
## adjust working directory in scripts from PBS to SLURM (submitdir="${SLURM_SUBMIT_DIR}") --> vim HiCPro_step1_hicpro.sh
sbatch HiCPro_step1_hicpro.sh
# (test) bash-4.4$ /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/samtools/samtools flagstat GABA372_AGTCAA_HWJW2DSXY_L001_001_sub.R1.fastq.bwt2glob.bam
# 43284807 + 0 in total (QC-passed reads + QC-failed reads)
# 0 + 0 secondary
# 0 + 0 supplementary
# 0 + 0 duplicates
# 16950518 + 0 mapped (39.16% : N/A)
#############################################
##### Running HiC-Pro #######################
##### Hyejung Won, 09/27/2019 ###############
#############################################
## Ref: https://nservant.github.io/HiC-Pro/USER_CASES.html
## (1) Run HiC-Pro
/proj/hyejunglab/program/HiCPro/HiC-Pro_2.11.4/bin/HiC-Pro \
-i /proj/hyejunglab/collab/fastq/Sample_742-neun \
-o /proj/hyejunglab/collab/hicpro/Sample_742-neun \
-c /proj/hyejunglab/collab/hicpro/config-hicpro_step1.txt -p
## (2) Run the following commands
sbatch HiCPro_step1_Hi-C.sh # mapping + filtering
sbatch HiCPro_step2_Hi-C.sh # statistics + ICE normalization + figures (QC)
## (3) Merge multiple files and normalize matrices
cp /proj/hyejunglab/collab/hicpro/Sample_694-neun/hic_results/data/fastq/fastq.allValidPairs /proj/hyejunglab/collab/hicpro/mergein/neuron/merge/Sample_694-neun.validPairs
cp /proj/hyejunglab/collab/hicpro/Sample_742-neun/hic_results/data/fastq/fastq.allValidPairs /proj/hyejunglab/collab/hicpro/mergein/neuron/merge/Sample_742-neun.validPairs
/proj/hyejunglab/program/HiCPro/HiC-Pro_2.11.4/bin/HiC-Pro \
-i /proj/hyejunglab/collab/mergein/neuron \
-o /proj/hyejunglab/collab/hicpro/mergeout/neuron \
-c /proj/hyejunglab/collab/hicpro/config-hicpro_arima_step2.txt \
-s merge_persample -s build_contact_maps -s ice_norm
### need to properly build and install HiC-Pro (conda environment, then make configure / make install)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/
conda create --name HiCPro python=3.8
conda activate HiCPro
conda install -c davebx hicpro
conda activate HiCPro
conda install -c bioconda bx-python
conda install -c conda-forge scipy
conda install -c bioconda pysam
conda install -c bioconda iced
conda install -c bioconda bowtie2
conda install -c bioconda samtools
conda install -c conda-forge wget
conda install -c r r
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/
git clone https://github.com/nservant/HiC-Pro.git
cd HiC-Pro
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/
## Edit config-install.txt file
# PREFIX = /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro
# CLUSTER_SYS = SLURM
vim config-install.txt
make configure
# make -f ./scripts/install/Makefile CONFIG_SYS=./config-install.txt
# make[1]: Entering directory '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro'
# ./scripts/install/install_dependencies.sh -c ./config-install.txt -p /usr/local/bin/ -o /usr/local/bin//HiC-Pro_3.1.0 -q
# Make sure internet connection works for your shell prompt under current user's privilege ...
# Starting HiC-Pro installation !
# Checking dependencies
# - Python libraries ...OK
# - R installation ...OK
# - Bowtie2 installation ...OK
# - Samtools installation ...OK
#
# Checking HiC-Pro configuration
# - Configuration for SLURM system ...OK
#
# done !
# make[1]: Leaving directory '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro'
make install
# (g++ -Wall -O2 -std=c++0x -o build_matrix /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/scripts/src/build_matrix.cpp; mv build_matrix /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/scripts)
# (g++ -Wall -O2 -std=c++0x -o cutsite_trimming /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/scripts/src/cutsite_trimming.cpp; mv cutsite_trimming /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/scripts)
# HiC-Pro installed in /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro !
# edit config-hicpro.txt
# slurm commands, bowtie2 index, genome file, chr file, restriction file
# move genome and chr file to annotation directory
# cp /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.fna /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/annotation/.
# cp /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.chr.size /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/annotation/.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/
# move hic data into hic/fastq/opioid.samples directory (one folder for each sample)
# /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/HiC/
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/opioid.samples -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p
# Run HiC-Pro 3.0.0 parallel mode
# The following command will launch the parallel workflow through 24 torque jobs:
# sbatch HiCPro_step1_hicpro.sh
# The following command will merge the processed data and run the remaining steps per sample:
# sbatch HiCPro_step2_hicpro.sh
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples
# alter the shell scripts to include the account name: #SBATCH -A SYB105
sbatch HiCPro_step1_hicpro.sh
sbatch HiCPro_step2_hicpro.sh
## change to HiCPro_step1_hicpro_mapping.sh
## change to HiCPro_step2_hicpro_mapping.sh <-- doesn't work because script didn't do HiC processing...
### try re-running step1 with nodes? N=2
################ Takes weeks to run... far too slow to be practical!
## The answer is CADES... https://docs.cades.ornl.gov/#condos/connecting/
## run on test data?
## can we process on Chr 1 only?? Don't we need to do the full alignment and then just take the first chromosome? Instead take a small subset of the fastq files (only 100,000 reads?) and run on that...
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro
conda install -c bioconda seqkit
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/
# move hic data into hic/fastq/subset.samples directory (one folder for each sample - only do GABA372 samples 1 and 2)
# /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/HiC/
# seqkit sample -p 0.01 -s 123 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/opioid.samples/GABA372/GABA372_AGTCAA_HWJW2DSXY_L001_001.R1.fastq.gz -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/subset.samples/GABA372/sub_GABA372_AGTCAA_HWJW2DSXY_L001_001.R1.fastq.gz
# [INFO] 4326514 sequences outputted
# seqkit sample -p 0.01 -s 123 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/opioid.samples/GABA372/GABA372_AGTCAA_HWJW2DSXY_L001_001.R2.fastq.gz -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/subset.samples/GABA372/sub_GABA372_AGTCAA_HWJW2DSXY_L001_001.R2.fastq.gz
# seqkit sample -p 0.01 -s 123 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/opioid.samples/GABA372/GABA372_AGTCAA_HWJW2DSXY_L002_001.R1.fastq.gz -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/subset.samples/GABA372/sub_GABA372_AGTCAA_HWJW2DSXY_L002_001.R1.fastq.gz
# seqkit sample -p 0.01 -s 123 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/opioid.samples/GABA372/GABA372_AGTCAA_HWJW2DSXY_L002_001.R2.fastq.gz -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/subset.samples/GABA372/sub_GABA372_AGTCAA_HWJW2DSXY_L002_001.R2.fastq.gz
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/subset.samples -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/subset.samples -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p
#### Test dataset
## Get the data. Will download a test_data folder and a configuration file
mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/test.data
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/test.data
wget https://zerkalo.curie.fr/partage/HiC-Pro/HiCPro_testdata.tar.gz && tar -zxvf HiCPro_testdata.tar.gz
## Run HiC-Pro
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/
time /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/test.data/test_data -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/test.data -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/test.data
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/fastq/opioid.samples -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s mapping -s quality_checks
# Run HiC-Pro 3.1.0 parallel mode
# The following command will launch the parallel workflow through 24 torque jobs:
# sbatch HiCPro_step1_hicpro.sh
# The following command will merge the processed data and run the remaining steps per sample:
# sbatch HiCPro_step2_hicpro.sh
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples
# vim HiCPro_step1_hicpro.sh ## need to add -A SYB105
# vim HiCPro_step2_hicpro.sh ## need to add -A SYB105
sbatch HiCPro_step1_hicpro.sh
## quality checks
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples/bowtie_results/bwt2_global -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s quality_checks
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples
sbatch HiCPro_step2_hicpro.sh
# change to HiCPro_step2_hicpro_QC.sh
## processing
# NOTE: need to samtools sort, index and view the bam files (commands below) and then run the proc_hic step
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/bin/samtools sort *.bam
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/bin/samtools index *.bam
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/bin/samtools view -c *.bam
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples/bowtie_results/bwt2_global -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s proc_hic
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/hicpro/opioid.samples
sbatch HiCPro_step1_hicpro.sh
# change to HiCPro_step1_hicpro_proc.sh
## merge
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hic/
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/bowtie_results/<INPUT_.validPairs > -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s merge_persample
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
## contact maps
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/bowtie_results/<INPUT_.validPairs > -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s build_contact_maps
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
## ICE normalization
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/bowtie_results/<INPUT_matrix> -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s ice_norm
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
# split the raw fastq files with the HiC-Pro utilities (split_reads.py): https://nservant.github.io/HiC-Pro/UTILS.html#utils
# gaba372
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba372 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba372/GABA372_AGTCAA_HWJW2DSXY_L001_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba372 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba372/GABA372_AGTCAA_HWJW2DSXY_L001_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba372.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba372.2/GABA372_AGTCAA_HWJW2DSXY_L002_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba372.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba372.2/GABA372_AGTCAA_HWJW2DSXY_L002_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba372.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba372.3/GABA372_AGTCAA_HWJW2DSXY_L003_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba372.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba372.3/GABA372_AGTCAA_HWJW2DSXY_L003_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba372.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba372.4/GABA372_AGTCAA_HWJW2DSXY_L004_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba372.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba372.4/GABA372_AGTCAA_HWJW2DSXY_L004_001.R2.fastq
# gaba376
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba376 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba376/GABA376_CCGTCC_HWJW2DSXY_L001_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba376 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba376/GABA376_CCGTCC_HWJW2DSXY_L001_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba376.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba376.2/GABA376_CCGTCC_HWJW2DSXY_L002_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba376.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba376.2/GABA376_CCGTCC_HWJW2DSXY_L002_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba376.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba376.3/GABA376_CCGTCC_HWJW2DSXY_L003_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba376.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba376.3/GABA376_CCGTCC_HWJW2DSXY_L003_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba376.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba376.4/GABA376_CCGTCC_HWJW2DSXY_L004_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/gaba376.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/gaba376.4/GABA376_CCGTCC_HWJW2DSXY_L004_001.R2.fastq
# glu372
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu372 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu372/Glu372_CTTGTA_HWJW2DSXY_L001_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu372 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu372/Glu372_CTTGTA_HWJW2DSXY_L001_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu372.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu372.2/Glu372_CTTGTA_HWJW2DSXY_L002_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu372.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu372.2/Glu372_CTTGTA_HWJW2DSXY_L002_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu372.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu372.3/Glu372_CTTGTA_HWJW2DSXY_L003_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu372.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu372.3/Glu372_CTTGTA_HWJW2DSXY_L003_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu372.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu372.4/Glu372_CTTGTA_HWJW2DSXY_L004_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu372.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu372.4/Glu372_CTTGTA_HWJW2DSXY_L004_001.R2.fastq
# glu376
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu376 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu376/Glu376_ATGTCA_HWJW2DSXY_L001_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu376 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu376/Glu376_ATGTCA_HWJW2DSXY_L001_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu376.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu376.2/Glu376_ATGTCA_HWJW2DSXY_L002_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu376.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu376.2/Glu376_ATGTCA_HWJW2DSXY_L002_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu376.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu376.3/Glu376_ATGTCA_HWJW2DSXY_L003_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu376.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu376.3/Glu376_ATGTCA_HWJW2DSXY_L003_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu376.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu376.4/Glu376_ATGTCA_HWJW2DSXY_L004_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/glu376.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/glu376.4/Glu376_ATGTCA_HWJW2DSXY_L004_001.R2.fastq
# olig372
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig372 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig372/OLIG372_AGTTCC_HWJW2DSXY_L001_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig372 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig372/OLIG372_AGTTCC_HWJW2DSXY_L001_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig372.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig372.2/OLIG372_AGTTCC_HWJW2DSXY_L002_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig372.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig372.2/OLIG372_AGTTCC_HWJW2DSXY_L002_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig372.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig372.3/OLIG372_AGTTCC_HWJW2DSXY_L003_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig372.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig372.3/OLIG372_AGTTCC_HWJW2DSXY_L003_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig372.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig372.4/OLIG372_AGTTCC_HWJW2DSXY_L004_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig372.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig372.4/OLIG372_AGTTCC_HWJW2DSXY_L004_001.R2.fastq
# olig376
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig376 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig376/OLIG376_GTCCGC_HWJW2DSXY_L001_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig376 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig376/OLIG376_GTCCGC_HWJW2DSXY_L001_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig376.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig376.2/OLIG376_GTCCGC_HWJW2DSXY_L002_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig376.2 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig376.2/OLIG376_GTCCGC_HWJW2DSXY_L002_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig376.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig376.3/OLIG376_GTCCGC_HWJW2DSXY_L003_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig376.3 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig376.3/OLIG376_GTCCGC_HWJW2DSXY_L003_001.R2.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig376.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig376.4/OLIG376_GTCCGC_HWJW2DSXY_L004_001.R1.fastq
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/split_reads.py --results_folder /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split/olig376.4 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata/olig376.4/OLIG376_GTCCGC_HWJW2DSXY_L004_001.R2.fastq
# specify the mapping step explicitly to run the HiC-Pro pipeline stepwise; see https://nservant.github.io/HiC-Pro/MANUAL.html
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/rawdata.split -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s mapping -s quality_checks
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro
## quality checks
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/bowtie_results/bwt2 -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s quality_checks
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
sbatch HiCPro_step2_hicpro.sh
# make into HiCPro_step2_qc_hicpro.sh
## processing
** need to samtools sort, index and view bam files and then run
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/bin/samtools sort *.bam
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/bin/samtools index *.bam
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/bin/samtools view -c *.bam
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/bowtie_results/bwt2 -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s proc_hic
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
sbatch HiCPro_step1_hicpro.sh
# make into HiCPro_step1_process_hicpro.1.sh HiCPro_step1_process_hicpro.2.sh HiCPro_step1_process_hicpro.3.sh HiCPro_step1_process_hicpro.4.sh
** running 1-20 (15 Sep 2021)
## merge
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/bowtie_results/<INPUT_.validPairs > -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s merge_persample
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
## contact maps
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/bowtie_results/<INPUT_.validPairs > -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s build_contact_maps
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
## ICE normalization
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/bowtie_results/<INPUT_matrix> -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s ice_norm
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/hicpro/
https://github.com/aidenlab/juicer https://github.com/aidenlab/juicer/wiki/Installation
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# conda install -c bioconda java-jdk
# java -jar juicer_tools.jar (command...) [flags...] <parameters...>
# Usage: juicer.sh [-g genomeID] [-d topDir] [-q queue] [-l long queue] [-s site]
# [-a about] [-R end] [-S stage] [-p chrom.sizes path]
# [-y restriction site file] [-z reference genome file]
# [-C chunk size] [-D Juicer scripts directory]
# [-Q queue time limit] [-L long queue time limit] [-e] [-h] [-x]
#mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer
#mkdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/fastq
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/scripts
# need to make sure files are labeled as *_R*.fastq*
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/scripts/common/juicer.sh -g hg38 -d /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.chr.size -y /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/restriction_sites/hg38_GATC_GANTC.txt -z /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.fna -D /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/
# submit as job on andes
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.test.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.1.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.1reps/juicer.1reps.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.all.sh
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J juicer.test
#SBATCH -N 2
#SBATCH -t 48:00:00
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/scripts
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/scripts/common/juicer.sh -g hg38 -d /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.all -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.chr.size -y /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/restriction_sites/hg38_GATC_GANTC.txt -z /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.fna -D /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/
### when I ran all replicates for a single sample for 48hours... all alignments are completed and merged but then it is killed...
# Relaunch via the same script. Type juicer.sh [options] -S stage where "stage" is one of merge, dedup, final, postproc, or early. "merge" is for when alignment has finished but merged_sort hasn't been created; "dedup" is for when merged_sort is there but not merged_nodups (this will relaunch all dedup jobs); "final" is for when merged_nodups is there and you want the stats and hic files; "postproc" is for when you have the hic files and just want feature annotations; and "early" is for early exit, before hic file creation. If your jobs failed at the alignment stage, run relaunch_prep.sh and then run juicer.sh
# juicer.sh [options] -S stage
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/scripts/common/juicer.sh -g hg38 -d /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.1reps -p /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.chr.size -y /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/restriction_sites/hg38_GATC_GANTC.txt -z /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.fna -D /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/ -S dedup
## make a directory for each sample with all replicates fastq files... make an sbatch script that lists all sbatch scripts for all samples
juicer.1reps = GABA372
juicer.2reps = GABA376
juicer.3reps = Glu372
juicer.4reps = Glu376
juicer.5reps = OLIG372
juicer.6reps = OLIG376
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.align.submit.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.dedup.submit.sh
https://www.encodeproject.org/documents/75926e4b-77aa-4959-8ca7-87efcba39d79/@@download/attachment/comp_doc_7july2018_final.pdf - One lane of HiSeq data comprises approximately 150 million raw reads - About 75% of the time, each read in a read pair will align to a single site in the genome. We call such read pairs “normal.” - Another 20% of read pairs are “chimeric”. This means that at least one of the two reads comprises multiple subsequences, each of which align to different parts of the genome - These “unambiguous” chimeric read pairs comprise roughly 15% of all read pairs and are included in our maps as ligation junctions between locus A and locus B. All other chimeric read pairs are “ambiguous” and are not included in our Hi-C maps
** Loop Resolution Hi-C Heatmap Read Threshold. In order for a Hi-C map to be considered loop resolution, the number of total sequenced reads must be at least 2 billion paired end reads for any given experiment. Note that an experiment can be comprised of multiple libraries.
II.a.1.viii.Summary of Hi-C Computational Standards: Intra Chromosomal >40.0% Inter Chromosomal <40.0% Chimeric Ambiguous <10.0% Chimeric Paired 10.0-30.0%
Metric | Fail | Marginal | Pass
Alignable Reads | <75.0% | 75.0-90.0% | >90.0%
Duplicates | — | >40.0% | <40.0%
Intra-fragment | >20.0% | 10.0-20.0% | <10.0%
Hi-C Contacts | <20.0% | 20.0-50.0% | >50.0%
Intra Short Range (<20 Kb) | >60.0% | 30.0-60.0% | <30.0%
Intra Long Range (≥20 Kb) | <20.0% | 20.0-35.0% | >35.0%
Ligations | <5.00% | 5.00-25.0% | >25.0%
What causes high levels of chimeric reads????
juicer.1reps [inter.txt]
# Sequenced Read Pairs: 1,626,483,097
# Normal Paired: 705,287,133 (43.36%)
# Chimeric Paired: 712,972,260 (43.84%) <-- A LOT of chimeric paired reads?? (should be closer to 70-80% normal paired and 15% chimeric)
# Chimeric Ambiguous: 185,490,930 (11.40%)
# Unmapped: 22,732,774 (1.40%)
# Ligation Motif Present: 0 (0.00%)
# Alignable (Normal+Chimeric Paired): 1,418,259,393 (87.20%)
juicer.3reps
# Sequenced Read Pairs: 1,782,022,300
# Normal Paired: 757,834,565 (42.53%)
# Chimeric Paired: 798,015,464 (44.78%)
# Chimeric Ambiguous: 205,029,885 (11.51%)
# Unmapped: 21,142,386 (1.19%)
# Ligation Motif Present: 0 (0.00%)
# Alignable (Normal+Chimeric Paired): 1,555,850,029 (87.31%)
grep '@' GABA372_AGTCAA_HWJW2DSXY_L001_001_R1.fastq | wc -l
# 432,868,186
Stella: - Re replicates, we had 4 lanes on a Novaseq 6000 flow cell (PE150). Thus, for each sample, we have 400M reads per lane X 4lanes=1.6 billion reads, as you said (see the de-multiplexing file attached). - For each cell type (Glu, GABA, OLIG) we had 2 replicate subjects (cases 372 and 376). So, we can merge these 2 biological replicates for each cell type, obtaining 2x4x400M ~3billion reads. Theoretically, should be enough for loops.
juicer.gaba = GABA372 + GABA376
juicer.glu = Glu372 + Glu376
juicer.olig = OLIG372 + OLIG376
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
module load gcc/6.5.0
module load cuda
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.reps.submit.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.chimeric.reps.submit.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.merge.reps.submit.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.dedup.reps.submit.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.final.reps.submit.sh
scp -r noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/inter_30_loops/ "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/."
######## Determined the issue! When combining all reps into a single directory to run... the 48hr time limit does not allow for the completion of alignments for all samples and then the next stage you can specify is merge and so it is merging without all of the raw data! So... run initial step with each sample separate and then transfer the split alignment files to the merged directory and run from the chimeric step!
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.reps.submit.sh
## run on each individual sample and then transfer the splits directory contents to the merged directory for that cell type
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.chimeric.reps.submit.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.merge.reps.submit.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.dedup.reps.submit.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.final.reps.submit.sh
## sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/juicer.postproc.reps.submit.sh
*** Found the real issue is that the dedup step is cutting short due to time limit?... how to allow longer time on andes?? not time limit, memory??
--> Ask Kyle? Re-submit on Wednesday after maintenance
--> submit shorter walltime with output and error file declared to see if error file produces anything? try on gaba <-- NOPE
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/aligned
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/juicer.gaba.dedup.test.sh
--> also try removing restriction site fragments filepath... try on glu <-- NOPE
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.glu/aligned
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.glu/juicer.glu.dedup.test.sh
--> lastly try breaking the merged_sort.txt file by chromosome and seeing if the job can run through that way? try on olig******
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.olig/aligned
mv merged_sort.txt merged_sort_all.txt
awk '{if ($2 == "NC_000001.11") print $0}' merged_sort_all.txt > merged_sort.txt
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.olig/juicer.olig.dedup.test.sh
** Could not initialize class jcuda.driver.JCudaDriver
# run dedups step for each chromosome individually?... make separate directories?
## first try with a few chromosomes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/aligned
mv merged_sort.txt merged_sort_all.txt
awk '{if ($2 == "NC_000001.11" || $2 == "NC_000002.12") print $0}' merged_sort_all.txt > merged_sort.txt
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/juicer.gaba.dedup.sh
## try chr 2 for olig and glu
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.olig/aligned
mv inter.hic chr1.inter.hic
mv inter_30.txt chr1.inter_30.txt
mv inter_30_hists.m chr1.inter_30_hists.m
mv inter_30.hic chr1.inter_30.hic
mv inter_30_contact_domains/5000_blocks.bedpe inter_30_contact_domains/chr1.5000_blocks.bedpe
awk '{if ($2 == "NC_000002.12") print $0}' merged_sort_all.txt > merged_sort.txt
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.olig/juicer.olig.dedup.test.sh
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.glu/aligned
## do same commands as above
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.glu/juicer.glu.dedup.test.sh
# try running gaba with Long queue time??
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/aligned
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/juicer.gaba.dedup.test.sh
# make separate directories for each chromosome merged_sort.txt file... what other files are needed?
##### Have I been using the wrong version of the juicer.sh script this whole time???? It looks like it was written for a single CPU instead of parallelizing... the scripts are named the same but are different versions in the github repo??
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/aligned
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/juicer.gaba.dedup.parallel.sh
#### Try Creating a "mega" map: https://github-wiki-see.page/m/aidenlab/juicer/wiki/Usage
To create statistics and a hic file from a series of replicates, you can use the mega.sh script.
Create the following directory structure (the files can be soft-linked):
/opt/juicer/work/HeLa/HIC001/aligned/merged_nodups.txt
/opt/juicer/work/HeLa/HIC002/aligned/merged_nodups.txt
/opt/juicer/work/HeLa/HIC003/aligned/merged_nodups.txt
/opt/juicer/work/HeLa/HIC001/aligned/inter.txt
/opt/juicer/work/HeLa/HIC002/aligned/inter.txt
/opt/juicer/work/HeLa/HIC003/aligned/inter.txt
cd /opt/juicer/work/HeLa/
And then run mega.sh with the same kinds of flags as with Juicer:
/opt/juicer/scripts/mega.sh -g hg19 -s DpnII
A "mega" folder will be created at /opt/juicer/work/HeLa/mega and underneath that, the aligned folder will contain the results.
#### go back and try to run the dedups step for the 6 different replicate sets?
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.2reps/
sbatch juicer.2reps.dedup.sh
sbatch juicer.2reps.dedup.parallel.sh
22 July discussion: - Stella and Alex talked with Arima… - Arima is a 2-enzyme chemistry protocol (reduces the necessary number of reads) - they typically suggest 600-700M paired end reads to get 5kb resolution - side by side QC concerning… MiSeq vs. NovaSeq - high levels of duplicate reads --> diminishing returns from the high level of sequencing - chimeric reads = due to 150bp pe reads (longer reads, more chances to capture the ligation junction)… MiSeq was 75bp - chimeric ambiguous = encompassing what would otherwise be unmapped? - Long Range = 452,000,000 (generally work with 220,000,000) - Loops… should expect at least 5kb resolution if not finer detail - hic file generated by juicer… by default should contain all necessary loops for proper resolution - if we want to get even deeper resolution we need to adjust parameters - HiCCUPS command requires GPUs and a lot of RAM… generates a temp file that can cause the pipeline to crash? - Use hg19 reference - Look at C9orf72 when can view in Juicebox
# trying to understand the output... why am I only getting the contact domains and not the loops :/
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
hic <- read.delim("5000_blocks.bedpe", header=T, sep="\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Juicer")
hic2 <- read.delim("gaba.5000_blocks.bedpe", header=T, sep="\t")
library(tidygenomics)
library(dplyr)
library(tidyr)
hic$x.length <- hic$x2 - hic$x1
hic$y.length <- hic$y2 - hic$y1
hic$dist <- hic$y1 - hic$x2
hic2$x.length <- hic2$x2 - hic2$x1
hic2$y.length <- hic2$y2 - hic2$y1
hic2$dist <- hic2$y1 - hic2$x2
mean(na.omit(hic$x.length))
# 358423.3
mean(na.omit(hic2$x.length))
# 326128
min(na.omit(hic2$x.length))
# 65000
min(na.omit(hic$x.length))
# 60000
## Looking into error files:
# Finished preprocess
#
# Calculating norms for zoom BP_2500000
# Calculating norms for zoom BP_1000000
# Calculating norms for zoom BP_500000
# Calculating norms for zoom BP_250000
# Calculating norms for zoom BP_100000
# Calculating norms for zoom BP_50000
# Calculating norms for zoom BP_25000
# Calculating norms for zoom BP_10000
# Calculating norms for zoom BP_5000
# Writing expected
# Writing norms
# Finished writing norms
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer//scripts/common/juicer_tools is post-processing Hi-C for hg38
# Data read from /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.glu/aligned/inter_30.hic.
# Motifs read from /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer//references/motif
#
# ARROWHEAD:
#
# Picked up _JAVA_OPTIONS: -Xmx16384m
# Reading file: /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.glu/aligned/inter_30.hic
# Default settings for 5kb being used
# max 2.0
# 50%
# 100%
# 439 domains written to file: /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.glu/aligned/inter_30_contact_domains/5000_bl
# ocks.bedpe
# Arrowhead complete
#
# HiCCUPS:
#
# Picked up _JAVA_OPTIONS: -Xmx16384m
# Reading file: /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.glu/aligned/inter_30.hic
# No valid configurations specified, using default settings
# Default settings for 5kb, 10kb, and 25kb being used
# Running HiCCUPS for resolution 5000
# Data not available for NC_000002.12 at 5000 resolution
# Jul 27, 2021 11:13:10 PM jcuda.utils.KernelLauncher preparePtxFile
# SEVERE: errorMessage:
# In file included from /sw/andes/spack-envs/base/opt/linux-rhel8-x86_64/gcc-8.3.1/cuda-10.2.89-e3gecwubsl7bnpgrtfk5chm24t3euot5/include/cuda_runtime.h:8
# 3,
# from <command-line>:
# /sw/andes/spack-envs/base/opt/linux-rhel8-x86_64/gcc-8.3.1/cuda-10.2.89-e3gecwubsl7bnpgrtfk5chm24t3euot5/include/crt/host_config.h:138:2: error: #error
# -- unsupported GNU version! gcc versions later than 8 are not supported!
# 138 | #error -- unsupported GNU version! gcc versions later than 8 are not supported!
# | ^~~~~
############ maybe try running HiCCUPS separately (not within the pipeline)???
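# https://www.bioinformatics.babraham.ac.uk/projects/hicup/read_the_docs/html/index.html
# NOTE: HiCUP (below) is a separate Hi-C mapping/filtering pipeline; it is not the same tool as the HiCCUPS step that failed in the Juicer log above.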
HiCUP (Hi-C User Pipeline) comprises six Perl scripts for analysing Hi-C sequence data:
HiCUP Digester - creates a digested reference genome
HiCUP - executes sequentially the scripts below
HiCUP Truncater - cuts reads at the putative Hi-C ligation junction
HiCUP Mapper - aligns read pairs independently to a reference genome
HiCUP Filter - removes commonly encountered Hi-C artefacts
HiCUP Deduplicator - removes (retaining one copy) putative PCR duplicates
############ or maybe it is a CUDA issue... wrong version... juicer compiled with cuda 7 or 7.5 and andes has cuda/10.2.89
# https://github.com/aidenlab/juicer
# scp /Users/27n/Downloads/JCuda-All-10.0.0.zip noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/.
module load gcc/6.5.0 cuda
--> altering the gpu SLURM script to be usable on Andes… - download proper CUDA 10 compiled JCuda - move SLURM/scripts to base scripts directory - alter juicer.sh script to remove other lab specific criteria /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/scripts/juicer.sh - change --ntasks to -N - start anaconda and be sure proper dependencies are available (bwa, java, gpu)
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
#conda create --name juicer bwa=0.7.17 java-jdk=8.0.112 python=3.8 -c bioconda -c conda-forge
conda activate juicer
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.olig/
sbatch juicer.olig.test.sh
sbatch juicer.olig.merge.test.sh
# to cancel all jobs running:
## squeue -h -u noshayjm -o %i | xargs -n 1 scancel   (-h drops the JOBID header line; equivalently: scancel -u noshayjm)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/
sbatch juicer.gaba.dedup.test.sh
sbatch juicer.gaba.merge.test.sh
# ***! Error! The sorted file and dups/no dups files do not add up, or were empty.
### The issue is still at the merge/dedup step...
# vim /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/scripts/juicer.sh
## --mem-per-cpu=0
## Tab errors on my side... have Mikaela go in and fix them... /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/scripts/juicer_mika.sh
##### move split directory output from old (single cpu) runs and start from merge step with new script...
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.gaba/
sbatch juicer.gaba.test.sh
https://github.com/deeptools/HiCExplorer
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#conda deactivate
#conda install hicexplorer -c bioconda -c conda-forge
conda create --name hicexplorer hicexplorer=3.6 python=3.8 -c bioconda -c conda-forge
conda activate hicexplorer
# Estimate quality of data
## run hicQuickQC
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.4reps/splits
mkdir hicQuickQC
hicQuickQC --samFiles Glu376_ATGTCA_HWJW2DSXY_L001_001.fastq.sam Glu376_ATGTCA_HWJW2DSXY_L002_001.fastq.sam Glu376_ATGTCA_HWJW2DSXY_L003_001.fastq.sam Glu376_ATGTCA_HWJW2DSXY_L004_001.fastq.sam --QCfolder hicQuickQC
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
library(dplyr)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
df <- read.delim("bin.cpm.matrix.id.chr1.txt", header=F, sep="\t")
val.df <- as.matrix(df[,4])
scales <- seq(1,48,2)
wCoefs <- cwt(val.df, scales = scales, wavelet = "mexh")
wCoefs <- cbind(as.vector(val.df), wCoefs)
## Plot the 2-D CWT coefficients as image (It may take a while!)
xTickInterval <- 1000
plotRange <- c(5000, 11000)
pdf("atac.wavelet.pdf")
image(plotRange[1]:plotRange[2], scales, wCoefs[plotRange[1]:plotRange[2],], col=terrain.colors(256), axes=FALSE, xlab='m/z index', ylab='CWT coefficient scale', main='CWT coefficients')
axis(1, at=seq(plotRange[1], plotRange[2], by=xTickInterval))
axis(2, at=c(1, seq(10, 64, by=10)))
box()
dev.off()
colnames(temp.wCoefs) <- c(0, scales)
temp.localMax <- getLocalMaximumCWT(temp.wCoefs)
temp.ridgeList <- getRidge(temp.localMax)
SNR.Th <- 3
nearbyPeak <- TRUE
temp.majorPeakInfo <- identifyMajorPeaks(temp.df, temp.ridgeList, temp.wCoefs, SNR.Th = SNR.Th, nearbyPeak=nearbyPeak)
## Plot the identified peaks
peakIndex <- temp.majorPeakInfo$peakIndex
data(temp.df)
SNR.Th <- 3
pdf("atac.wavelet.peak.pdf")
plotLocalMax(temp.localMax, temp.wCoefs)
dev.off()
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
pdf(file='atac.wavelet.scale.pdf')
par(mfrow=c(2,1))
scales <- seq(1, 60, 2)
wCoefs <- cwt(val.df, scales = scales, wavelet = "mexh")
image(1:length(val.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='ATAC-seq')
scales <- seq(1, 100, 2)
wCoefs <- cwt(val.df, scales = scales, wavelet = "mexh")
image(1:length(val.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='ATAC-seq')
dev.off()
pdf(file='atac.wavelet.scale100.pdf')
par(mfrow=c(2,1))
scales <- seq(1, 100, 2)
wCoefs <- cwt(val.df, scales = scales, wavelet = "mexh")
image(1:length(val.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='ATAC-seq')
dev.off()
pdf(file='atac.wavelet.scale48.pdf')
par(mfrow=c(2,1))
scales <- seq(1, 48, 2)
wCoefs <- cwt(val.df, scales = scales, wavelet = "mexh")
image(1:length(val.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='ATAC-seq')
dev.off()
library(dplyr)
library(reshape2)
library(MassSpecWavelet)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
df <- read.delim("bin.cpm.matrix.id.chr1.txt", header=F, sep="\t")
val.df <- as.matrix(df[,4])
scales <- seq(1, 100, 2)
wCoefs <- cwt(val.df, scales=scales, wavelet='mexh')
pdf(file='cwt.atac.chr1.plot.pdf')
par(mfrow=c(4,1))
scales <- seq(1, 100, 2)
image(1:length(val.df), scales, wCoefs, col=terrain.colors(256), axes=FALSE, ylab='CWT coefficient scale', main='CWT opioid ATAC coefficients')
dev.off()
# Shell setup run before starting the R session below.
# Interactive allocation used for this session (kept for reference):
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
# Project helper script (contents not shown here); presumably makes `conda`
# available in this shell -- TODO confirm.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# Activate the conda environment stored under the project Libraries directory.
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The lines that follow are entered in R:
# R
library(dplyr)
library(reshape2)
library(wmtsa)
library(tidyr)
# Maximal-overlap DWT of the chr1 bin values (column 4 of the bin matrix).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
df <- read.delim("bin.cpm.matrix.id.chr1.txt", header = FALSE, sep = "\t")
val.df <- as.matrix(df[, 4])

# Run the MODWT (wmtsa::wavMODWT) and flatten the result to a matrix
modwt <- wavMODWT(val.df)
modwt.df <- as.matrix(modwt)

# Turn the row names into columns: each row name is split at runs of
# non-alphanumeric characters into a name part and a number part.
modwt.label <- data.frame(label = rownames(modwt.df), modwt.df)
modwt.name <- separate(modwt.label, label, into = c("name", "number"), sep = "[^[:alnum:]]+")
names(modwt.name) <- c("scale", "window", "dwt")

# Default plot of the transform object
pdf("modwt.atac.chr1.plot.pdf")
plot(modwt)
dev.off()
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7643171/ - The normalized read count matrix from voomWithQualityWeights was then modeled by fitting weighted least-squares linear regression models estimating the effect of the right-hand side variables on the accessibility of each OCR: chromatin accessibility ~ cell type:brain region+gender+FRiP. In so doing, we model both cell type and brain region effects. In this model, the effect on the chromatin accessibility of an OCR can then be assessed by testing the coefficient of interest for being non-vanishing using the linear regression utilities implemented in limma.
# For ARC and NPAS4: find the Hi-C blocks whose first interval overlaps the
# gene, then intersect the second interval of those blocks with ATAC peaks
# and with ChIP peaks.

# ---- inputs
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Genome")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header = FALSE, sep = "\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/atac.peaks")
atac <- read.delim("GLU-276_TAGGCATG-GAGCCTTA_HGHM2DSXY_L003_001.R1_peaks.bed", header = FALSE, sep = "\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Chipseq")
chip <- read.delim("H.372.GLU_peaks.chrfix.broadPeak", header = FALSE, sep = "\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
hic <- read.delim("5000_blocks.bedpe", header = TRUE, sep = "\t")

library(tidygenomics)
library(dplyr)
library(tidyr)

# ---- tidy the tables
names(gene) <- c("chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info")
# split the attribute column at spaces into its first two pieces
gene.id <- gene %>% separate("info", into = c("gene", "id"), sep = " ")

# peak tables are named chr2/start2/end2 so they line up with the second
# Hi-C interval
atac.df <- atac[, 1:4]
names(atac.df) <- c("chr2", "start2", "end2", "peak.id")
chip.df <- chip[, 1:4]
names(chip.df) <- c("chr2", "start2", "end2", "peak.id")

# first data row of the bedpe is skipped; columns 1-6 are the two intervals
# and column 12 is used as the score
hic.df <- hic[2:nrow(hic), c(1:6, 12)]
names(hic.df) <- c("chr", "start", "end", "chr2", "start2", "end2", "hic.score")
hic.df$hic.id <- seq.int(nrow(hic.df))

# ---- gene -> Hi-C block -> peaks
gene.arc <- subset(gene.id, id == "ARC;")
gene.npas <- subset(gene.id, id == "NPAS4;")

first.interval <- c("chr", "start", "end")
second.interval <- c("chr2", "start2", "end2")

arc.hic <- genome_intersect(hic.df, gene.arc, by = first.interval)
npas.hic <- genome_intersect(hic.df, gene.npas, by = first.interval)

arc.hic.atac <- genome_intersect(arc.hic, atac.df, by = second.interval)
npas.hic.atac <- genome_intersect(npas.hic, atac.df, by = second.interval)
arc.hic.chip <- genome_intersect(arc.hic, chip.df, by = second.interval)
npas.hic.chip <- genome_intersect(npas.hic, chip.df, by = second.interval)

# ---- ARC summary (recorded output below each call)
length(unique(arc.hic.atac$hic.id))
# 2
length(unique(arc.hic.atac$peak.id))
# 24
unique(arc.hic.atac$peak.id)
# [1] "Peak_46641" "Peak_46647" "Peak_46648" "Peak_46649" "Peak_46650"
# [6] "Peak_46651" "Peak_46652" "Peak_46653" "Peak_46657" "Peak_46658"
# [11] "Peak_46662" "Peak_46664" "Peak_46665" "Peak_46669" "Peak_46670"
# [16] "Peak_46671" "Peak_46672" "Peak_46673" "Peak_46674" "Peak_46675"
# [21] "Peak_46676" "Peak_46678" "Peak_46680" "Peak_46682"
library(tidygenomics)
library(dplyr)
library(tidyr)
# Overlap ATAC peaks (control = sample 276, opioid = sample 302) with the
# strongly DE GLU genes, first over the gene body and then over a 2 kb
# promoter window.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Genome")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header=F, sep="\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/atac.peaks")
glu.atac.control <- read.delim("GLU-276_TAGGCATG-GAGCCTTA_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header=F, sep="\t")
glu.atac.opioid <- read.delim("GLU-302_TCCTGAGC-AAGGCTAT_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header=F, sep="\t")
# FIX: the control table used to read the GABA-302 file, i.e. the same file as
# the opioid sample. The GABA-276 file is the control used by the later
# sessions of this notebook.
gaba.atac.control <- read.delim("GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1.narrowPeak", header=F, sep="\t")
gaba.atac.opioid <- read.delim("GABA-302_TCCTGAGC-TATCCTCT_HGHM2DSXY_L003_001.R1.narrowPeak", header=F, sep="\t")
olig.atac.control <- read.delim("OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header=F, sep="\t")
olig.atac.opioid <- read.delim("OLIG-302_TCCTGAGC-TTATGCGA_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header=F, sep="\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
glu <- read.delim("Glu_intersection.csv", header=T, sep=",")
# NOTE(review): other sessions read "Gabba_intersection.csv" -- confirm which
# spelling exists on disk.
gaba <- read.delim("Gaba_intersection.csv", header=T, sep=",")
olig <- read.delim("Olig_intersection.csv", header=T, sep=",")

# Gene table: keep the first attribute value (text before the first ";") as gid
colnames(gene) <- c("chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info")
gene.df <- separate(gene, "info", c("gene", "id"), sep=" ")
gene.id <- separate(gene.df, "id", c("gid"), sep=";")

# DE genes with |log2FC| > 2 and adjusted p < 0.05 (recorded rows below)
glu.log2.pval.up <- subset(glu, glu$Significance == "UP" & abs(glu$log2FC) > 2 & glu$Padj < 0.05)
# 15 15 PRSS35 UP 2.093866 0.003142919
# 76 76 FRMD7 UP 2.700107 0.043355159
glu.log2.pval.down <- subset(glu, glu$Significance == "DOWN" & abs(glu$log2FC) > 2 & glu$Padj < 0.05)
# 163 163 GNL1_2 DOWN -2.167138 0.02578113
glu.up1 <- subset(gene.id, gene.id$gid == "PRSS35")
glu.up2 <- subset(gene.id, gene.id$gid == "FRMD7")
glu.down <- subset(gene.id, gene.id$gid == "GNL1_2")

# --- gene body: duplicate the coordinates as chr2/start2/end2 so the result
# of the first intersect can be intersected again with the opioid peaks
glu.up1$chr2 <- glu.up1$chr
glu.up1$start2 <- glu.up1$start
glu.up1$end2 <- glu.up1$end
colnames(glu.atac.control) <- c("chr", "start", "end", "peak", "value", "dot", "v7", "v8", "v9", "v10")
colnames(glu.atac.opioid) <- c("chr2", "start2", "end2", "peak", "value", "dot", "v7", "v8", "v9", "v10")
glu.up1.atac <- genome_intersect(glu.up1, glu.atac.control, by=c("chr", "start", "end"))
# 2 peaks intersect with the gene in the control sample
glu.up1.atac.opioid <- genome_intersect(glu.up1.atac, glu.atac.opioid, by=c("chr2", "start2", "end2"))
# 4 peaks intersect with the gene in the opioid sample
# NOTE(review): the input already has one row per control peak, so this row
# count is presumably (control peaks x opioid peaks), not the number of opioid
# peaks -- confirm with length(unique(...)) on the opioid peak column.

# --- promoter: 2 kb upstream of the TSS.
# FIX: the window is now strand-aware. For "-" strand genes the TSS is the
# `end` coordinate, so the promoter is end..end+2000; every other strand
# value keeps the previous start-2000..start window.
glu.up1.minus <- glu.up1$strand %in% "-"
glu.up1$prom.start <- ifelse(glu.up1.minus, glu.up1$end, glu.up1$start - 2000)
glu.up1$prom.end <- ifelse(glu.up1.minus, glu.up1$end + 2000, glu.up1$start)
glu.up1$prom.start2 <- glu.up1$prom.start
glu.up1$prom.end2 <- glu.up1$prom.end
colnames(glu.atac.control) <- c("chr", "prom.start", "prom.end", "peak", "value", "dot", "v7", "v8", "v9", "v10")
colnames(glu.atac.opioid) <- c("chr2", "prom.start2", "prom.end2", "peak", "value", "dot", "v7", "v8", "v9", "v10")
glu.up1.atac.promoter <- genome_intersect(glu.up1, glu.atac.control, by=c("chr", "prom.start", "prom.end"))
glu.up1.atac.promoter.opioid <- genome_intersect(glu.up1.atac.promoter, glu.atac.opioid, by=c("chr2", "prom.start2", "prom.end2"))
library(tidygenomics)
library(dplyr)
library(tidyr)
# Load the gene table, the six ATAC narrowPeak files (276 = control,
# 302 = opioid), the DE tables and the TMM-normalised counts.

# ---- gene annotation; gid is the text before the first ";" of the second
# space-separated piece of the attribute column
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Genome")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header = FALSE, sep = "\t")
names(gene) <- c("chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info")
gene.df <- gene %>% separate(info, into = c("gene", "id"), sep = " ")
gene.id <- gene.df %>% separate(id, into = "gid", sep = ";")

# ---- ATAC peaks
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/atac.peaks")
glu.atac.control <- read.delim("GLU-276_TAGGCATG-GAGCCTTA_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header = FALSE, sep = "\t")
glu.atac.opioid <- read.delim("GLU-302_TCCTGAGC-AAGGCTAT_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header = FALSE, sep = "\t")
gaba.atac.control <- read.delim("GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1.narrowPeak", header = FALSE, sep = "\t")
gaba.atac.opioid <- read.delim("GABA-302_TCCTGAGC-TATCCTCT_HGHM2DSXY_L003_001.R1.narrowPeak", header = FALSE, sep = "\t")
olig.atac.control <- read.delim("OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header = FALSE, sep = "\t")
olig.atac.opioid <- read.delim("OLIG-302_TCCTGAGC-TTATGCGA_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header = FALSE, sep = "\t")
# 220004, 154360, 198303, 134045, 127091, 120009

# ---- RNA-seq: DE calls and TMM counts (comma-separated, with header)
# NOTE(review): the GABA files are spelled "Gabba_" for the DE table and
# "Gaba_" for the TMM table -- confirm both names against the directory.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
glu.DE <- read.csv("Glu_intersection.csv")
gaba.DE <- read.csv("Gabba_intersection.csv")
olig.DE <- read.csv("Olig_intersection.csv")
#setwd("/gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only")
gaba.TMM <- read.csv("Gaba_edgeR_TMM_normalized_counts.csv")
glu.TMM <- read.csv("Glu_edgeR_TMM_normalized_counts.csv")
olig.TMM <- read.csv("Olig_edgeR_TMM_normalized_counts.csv")

# Column 1 is the gene id. Columns 6 and 9 are later labelled as samples 276
# and 302 -- presumably those samples' counts; confirm against the CSV header.
glu.TMM.control <- glu.TMM[, c(1, 6)]
glu.TMM.opioid <- glu.TMM[, c(1, 9)]
gaba.TMM.control <- gaba.TMM[, c(1, 6)]
gaba.TMM.opioid <- gaba.TMM[, c(1, 9)]
olig.TMM.control <- olig.TMM[, c(1, 6)]
olig.TMM.opioid <- olig.TMM[, c(1, 9)]
# GLU: per-gene ATAC peak counts vs expression, peaks padded by 2 kb per side
names(glu.TMM.control) <- c("gid", "glu.276")
names(glu.TMM.opioid) <- c("gid", "glu.302")
# attach gene coordinates to the expression values
glu.TMM.control.gid <- glu.TMM.control %>% inner_join(gene.id, by = "gid")
glu.TMM.opioid.gid <- glu.TMM.opioid %>% inner_join(gene.id, by = "gid")

# pad every peak and intersect with the genes (control, then opioid)
glu.atac.control <- glu.atac.control %>% mutate(chr = V1, start = V2 - 2000, end = V3 + 2000)
glu.atac.control.gene <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by = c("chr", "start", "end"))
glu.atac.opioid <- glu.atac.opioid %>% mutate(chr = V1, start = V2 - 2000, end = V3 + 2000)
glu.atac.opioid.gene <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by = c("chr", "start", "end"))

# number of intersecting rows per gene; columns 12, 13 and 22 are presumably
# gid, the expression value and the new count -- confirm if the inputs change
glu.control.gene.count <- mutate(group_by(glu.atac.control.gene, gid), control.peak.count = n())
glu.control.gene.count.uniq <- unique(glu.control.gene.count[, c(12, 13, 22)])
glu.opioid.gene.count <- mutate(group_by(glu.atac.opioid.gene, gid), opioid.peak.count = n())
glu.opioid.gene.count.uniq <- unique(glu.opioid.gene.count[, c(12, 13, 22)])

# genes present in both samples; differences are control minus / over opioid
glu.count <- glu.control.gene.count.uniq %>% inner_join(glu.opioid.gene.count.uniq, by = "gid")
glu.summary <- mutate(
  group_by(glu.count, gid),
  peak.diff = control.peak.count - opioid.peak.count,
  log2fc = log2(glu.276 / glu.302)
)
# GABA: per-gene ATAC peak counts vs expression, peaks padded by 2 kb per side
names(gaba.TMM.control) <- c("gid", "gaba.276")
names(gaba.TMM.opioid) <- c("gid", "gaba.302")
# attach gene coordinates to the expression values
gaba.TMM.control.gid <- gaba.TMM.control %>% inner_join(gene.id, by = "gid")
gaba.TMM.opioid.gid <- gaba.TMM.opioid %>% inner_join(gene.id, by = "gid")

# pad every peak and intersect with the genes (control, then opioid)
gaba.atac.control <- gaba.atac.control %>% mutate(chr = V1, start = V2 - 2000, end = V3 + 2000)
gaba.atac.control.gene <- genome_intersect(gaba.atac.control, gaba.TMM.control.gid, by = c("chr", "start", "end"))
gaba.atac.opioid <- gaba.atac.opioid %>% mutate(chr = V1, start = V2 - 2000, end = V3 + 2000)
gaba.atac.opioid.gene <- genome_intersect(gaba.atac.opioid, gaba.TMM.opioid.gid, by = c("chr", "start", "end"))

# number of intersecting rows per gene; columns 12, 13 and 22 are presumably
# gid, the expression value and the new count -- confirm if the inputs change
gaba.control.gene.count <- mutate(group_by(gaba.atac.control.gene, gid), control.peak.count = n())
gaba.control.gene.count.uniq <- unique(gaba.control.gene.count[, c(12, 13, 22)])
gaba.opioid.gene.count <- mutate(group_by(gaba.atac.opioid.gene, gid), opioid.peak.count = n())
gaba.opioid.gene.count.uniq <- unique(gaba.opioid.gene.count[, c(12, 13, 22)])

# genes present in both samples; differences are control minus / over opioid
gaba.count <- gaba.control.gene.count.uniq %>% inner_join(gaba.opioid.gene.count.uniq, by = "gid")
gaba.summary <- mutate(
  group_by(gaba.count, gid),
  peak.diff = control.peak.count - opioid.peak.count,
  log2fc = log2(gaba.276 / gaba.302)
)
# OLIG: per-gene ATAC peak counts vs expression, peaks padded by 2 kb per side
names(olig.TMM.control) <- c("gid", "olig.276")
names(olig.TMM.opioid) <- c("gid", "olig.302")
# attach gene coordinates to the expression values
olig.TMM.control.gid <- olig.TMM.control %>% inner_join(gene.id, by = "gid")
olig.TMM.opioid.gid <- olig.TMM.opioid %>% inner_join(gene.id, by = "gid")

# pad every peak and intersect with the genes (control, then opioid)
olig.atac.control <- olig.atac.control %>% mutate(chr = V1, start = V2 - 2000, end = V3 + 2000)
olig.atac.control.gene <- genome_intersect(olig.atac.control, olig.TMM.control.gid, by = c("chr", "start", "end"))
olig.atac.opioid <- olig.atac.opioid %>% mutate(chr = V1, start = V2 - 2000, end = V3 + 2000)
olig.atac.opioid.gene <- genome_intersect(olig.atac.opioid, olig.TMM.opioid.gid, by = c("chr", "start", "end"))

# number of intersecting rows per gene; columns 12, 13 and 22 are presumably
# gid, the expression value and the new count -- confirm if the inputs change
olig.control.gene.count <- mutate(group_by(olig.atac.control.gene, gid), control.peak.count = n())
olig.control.gene.count.uniq <- unique(olig.control.gene.count[, c(12, 13, 22)])
olig.opioid.gene.count <- mutate(group_by(olig.atac.opioid.gene, gid), opioid.peak.count = n())
olig.opioid.gene.count.uniq <- unique(olig.opioid.gene.count[, c(12, 13, 22)])

# genes present in both samples. Unlike GLU/GABA this summary also carries the
# peak difference as a percentage of the control count, so log2fc ends up one
# column further right.
olig.count <- olig.control.gene.count.uniq %>% inner_join(olig.opioid.gene.count.uniq, by = "gid")
olig.summary <- mutate(
  group_by(olig.count, gid),
  peak.diff = control.peak.count - opioid.peak.count,
  peak.diff.percent = ((control.peak.count - opioid.peak.count) / control.peak.count) * 100,
  log2fc = log2(olig.276 / olig.302)
)
# Reduce each summary to its columns 1, 6 and the log2fc column (7 for
# GLU/GABA, 8 for OLIG because of its extra peak.diff.percent column), tag the
# cell type and stack the three tables.
glu.df <- glu.summary[, c(1, 6, 7)]
glu.df[["cell_type"]] <- "GLU"

gaba.df <- gaba.summary[, c(1, 6, 7)]
gaba.df[["cell_type"]] <- "GABA"

olig.df <- olig.summary[, c(1, 6, 8)]
olig.df[["cell_type"]] <- "OLIG"

all.df <- rbind(glu.df, gaba.df, olig.df)
library(ggplot2)
# setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")
# pdf("peak.exp.2kb.pdf")
# ggplot(all.df) + geom_point(aes(x=peak.diff, y=log2fc, color=cell_type)) + facet_grid(cell_type ~ .) + theme_classic()
# dev.off()
# Give the three DE tables the same column names, then attach the DE call of
# each gene to its peak/expression row. The left join keeps genes without a
# DE entry (their DE columns are NA).
names(glu.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
names(gaba.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
names(olig.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")

glu.df.de <- glu.df %>% left_join(glu.DE, by = "gid")
gaba.df.de <- gaba.df %>% left_join(gaba.DE, by = "gid")
olig.df.de <- olig.df %>% left_join(olig.DE, by = "gid")

all.de.df <- rbind(glu.df.de, gaba.df.de, olig.df.de)
# setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")
# pdf("peak.exp.2kb.DE.pdf")
# ggplot(all.de.df) + geom_point(aes(x=peak.diff, y=log2fc, alpha=0.5)) + facet_grid(cell_type ~ DE.sig) + theme_classic()
# dev.off()
#
# setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")
# pdf("peak.exp.2kb.DE.padj.pdf")
# ggplot(all.de.df) + geom_point(aes(x=peak.diff, y=log2fc, color=DE.sig, alpha=0.5, size = padj)) + facet_grid(cell_type ~ DE.sig) + theme_classic()
# dev.off()
#
# setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")
# pdf("peak.exp.2kb.DE.boxplot.pdf")
# ggplot(all.de.df) + geom_boxplot(aes(x=DE.sig, y=peak.diff, alpha=0.5)) + facet_grid(cell_type ~ .) + theme_classic()
# dev.off()
#
# pdf("peak.exp.percent.2kb.DE.boxplot.pdf")
# ggplot(all.de.df) + geom_boxplot(aes(x=DE.sig, y=peak.diff.percent, alpha=0.5)) + facet_grid(cell_type ~ .) + theme_classic()
# dev.off()
# Correlation between peak-count difference and expression log2FC, first for
# all cell types pooled and then for OLIG only (recorded output below each call).
all.de.df.na <- na.omit(all.de.df[,1:4])
cor(all.de.df.na$peak.diff, all.de.df.na$log2fc)
# 0.002569402
all.de.df.na.olig <- subset(all.de.df.na, all.de.df.na$cell_type == "OLIG")
cor(all.de.df.na.olig$peak.diff, all.de.df.na.olig$log2fc)
# 0.004306233
# FIX: alpha is now set as a constant outside aes(). Inside aes() the value
# 0.5 was mapped through the alpha scale, which adds an "alpha" legend and
# does not give 50% transparency.
ggplot(na.omit(all.de.df)) + geom_point(aes(x=peak.diff, y=log2fc), alpha=0.5) + facet_grid(cell_type ~ DE.sig) + theme_classic()
ggplot(na.omit(all.de.df)) + geom_boxplot(aes(x=DE.sig, y=peak.diff), alpha=0.5) + facet_grid(cell_type ~ .) + theme_classic()
### look at just association between peak number and expression (test on control olig)
# FIX: cor() now drops incomplete pairs itself (use = "complete.obs"). The
# previous cor(na.omit(x), na.omit(y)) removed NAs from each vector
# separately, which pairs up the wrong genes (or errors on unequal lengths)
# whenever the NAs are not in the same rows.
# The numbers below were recorded from the original run.
cor(olig.summary$olig.276, olig.summary$control.peak.count, use = "complete.obs")
# 0.3348429
ggplot(na.omit(olig.summary)) + geom_point(aes(x=olig.276, y=control.peak.count)) + theme_classic()
cor(olig.summary$olig.302, olig.summary$opioid.peak.count, use = "complete.obs")
# 0.3152536
cor(gaba.summary$gaba.276, gaba.summary$control.peak.count, use = "complete.obs")
# 0.3200285
cor(gaba.summary$gaba.302, gaba.summary$opioid.peak.count, use = "complete.obs")
# 0.3849514
# peak-count difference vs expression log2FC for OLIG
cor(olig.df.de$peak.diff, olig.df.de$log2fc, use = "complete.obs")
### use signal values (narrowpeak output file column 7) instead of peak count...
# Per gene: mean of column V7 over all intersecting peaks, for control and opioid.
olig.control.gene.value <- olig.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.value = mean(V7))
# columns 12, 13, 22: presumably gid, expression and the new mean -- confirm
olig.control.gene.value.uniq <- unique(olig.control.gene.value[,c(12,13,22)])
olig.opioid.gene.value <- olig.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.value = mean(V7))
olig.opioid.gene.value.uniq <- unique(olig.opioid.gene.value[,c(12,13,22)])
# full join: genes with peaks in only one sample are kept, with NA on the other side
olig.value <- full_join(olig.control.gene.value.uniq, olig.opioid.gene.value.uniq, by="gid")
olig.value.summary <- olig.value %>% group_by(gid) %>% mutate(peak.log2fc = log2(control.peak.value/opioid.peak.value), log2fc = log2(olig.276/olig.302))
# FIX: let cor() drop incomplete pairs (use = "complete.obs") instead of
# calling na.omit() on each vector separately; the full join above introduces
# NAs, and independent na.omit() calls only stay aligned by coincidence.
cor(olig.value.summary$olig.276, olig.value.summary$control.peak.value, use = "complete.obs")
# 0.2265158
ggplot(olig.value.summary) + geom_point(aes(x=olig.276, y=control.peak.value)) + theme_classic()
# change in signal vs change in expression, genes complete in both samples only
olig.value.summary.na <- na.omit(olig.value.summary)
cor(olig.value.summary.na$log2fc, olig.value.summary.na$peak.log2fc)
# -0.01985823
ggplot(na.omit(olig.value.summary)) + geom_point(aes(x=log2fc, y=peak.log2fc)) + theme_classic()
##### same analysis with 10 kb padding around each peak
# GLU
names(glu.TMM.control) <- c("gid", "glu.276")
names(glu.TMM.opioid) <- c("gid", "glu.302")
# attach gene coordinates to the expression values
glu.TMM.control.gid <- glu.TMM.control %>% inner_join(gene.id, by = "gid")
glu.TMM.opioid.gid <- glu.TMM.opioid %>% inner_join(gene.id, by = "gid")

# recompute chr/start/end from the raw V1-V3 columns with the wider padding,
# then intersect with the genes (control, then opioid)
glu.atac.control <- glu.atac.control %>% mutate(chr = V1, start = V2 - 10000, end = V3 + 10000)
glu.atac.control.gene <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by = c("chr", "start", "end"))
glu.atac.opioid <- glu.atac.opioid %>% mutate(chr = V1, start = V2 - 10000, end = V3 + 10000)
glu.atac.opioid.gene <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by = c("chr", "start", "end"))

# number of intersecting rows per gene; columns 12, 13 and 22 are presumably
# gid, the expression value and the new count -- confirm if the inputs change
glu.control.gene.count <- mutate(group_by(glu.atac.control.gene, gid), control.peak.count = n())
glu.control.gene.count.uniq <- unique(glu.control.gene.count[, c(12, 13, 22)])
glu.opioid.gene.count <- mutate(group_by(glu.atac.opioid.gene, gid), opioid.peak.count = n())
glu.opioid.gene.count.uniq <- unique(glu.opioid.gene.count[, c(12, 13, 22)])

# genes present in both samples; differences are control minus / over opioid
glu.count <- glu.control.gene.count.uniq %>% inner_join(glu.opioid.gene.count.uniq, by = "gid")
glu.summary <- mutate(
  group_by(glu.count, gid),
  peak.diff = control.peak.count - opioid.peak.count,
  log2fc = log2(glu.276 / glu.302)
)
# GABA: per-gene ATAC peak counts vs expression, peaks padded by 10 kb per side
names(gaba.TMM.control) <- c("gid", "gaba.276")
names(gaba.TMM.opioid) <- c("gid", "gaba.302")
# attach gene coordinates to the expression values
gaba.TMM.control.gid <- gaba.TMM.control %>% inner_join(gene.id, by = "gid")
gaba.TMM.opioid.gid <- gaba.TMM.opioid %>% inner_join(gene.id, by = "gid")

# recompute chr/start/end from the raw V1-V3 columns with the wider padding,
# then intersect with the genes (control, then opioid)
gaba.atac.control <- gaba.atac.control %>% mutate(chr = V1, start = V2 - 10000, end = V3 + 10000)
gaba.atac.control.gene <- genome_intersect(gaba.atac.control, gaba.TMM.control.gid, by = c("chr", "start", "end"))
gaba.atac.opioid <- gaba.atac.opioid %>% mutate(chr = V1, start = V2 - 10000, end = V3 + 10000)
gaba.atac.opioid.gene <- genome_intersect(gaba.atac.opioid, gaba.TMM.opioid.gid, by = c("chr", "start", "end"))

# number of intersecting rows per gene; columns 12, 13 and 22 are presumably
# gid, the expression value and the new count -- confirm if the inputs change
gaba.control.gene.count <- mutate(group_by(gaba.atac.control.gene, gid), control.peak.count = n())
gaba.control.gene.count.uniq <- unique(gaba.control.gene.count[, c(12, 13, 22)])
gaba.opioid.gene.count <- mutate(group_by(gaba.atac.opioid.gene, gid), opioid.peak.count = n())
gaba.opioid.gene.count.uniq <- unique(gaba.opioid.gene.count[, c(12, 13, 22)])

# genes present in both samples; differences are control minus / over opioid
gaba.count <- gaba.control.gene.count.uniq %>% inner_join(gaba.opioid.gene.count.uniq, by = "gid")
gaba.summary <- mutate(
  group_by(gaba.count, gid),
  peak.diff = control.peak.count - opioid.peak.count,
  log2fc = log2(gaba.276 / gaba.302)
)
# OLIG: per-gene ATAC peak counts vs expression, peaks padded by 10 kb per side
names(olig.TMM.control) <- c("gid", "olig.276")
names(olig.TMM.opioid) <- c("gid", "olig.302")
# attach gene coordinates to the expression values
olig.TMM.control.gid <- olig.TMM.control %>% inner_join(gene.id, by = "gid")
olig.TMM.opioid.gid <- olig.TMM.opioid %>% inner_join(gene.id, by = "gid")

# recompute chr/start/end from the raw V1-V3 columns with the wider padding,
# then intersect with the genes (control, then opioid)
olig.atac.control <- olig.atac.control %>% mutate(chr = V1, start = V2 - 10000, end = V3 + 10000)
olig.atac.control.gene <- genome_intersect(olig.atac.control, olig.TMM.control.gid, by = c("chr", "start", "end"))
olig.atac.opioid <- olig.atac.opioid %>% mutate(chr = V1, start = V2 - 10000, end = V3 + 10000)
olig.atac.opioid.gene <- genome_intersect(olig.atac.opioid, olig.TMM.opioid.gid, by = c("chr", "start", "end"))

# number of intersecting rows per gene; columns 12, 13 and 22 are presumably
# gid, the expression value and the new count -- confirm if the inputs change
olig.control.gene.count <- mutate(group_by(olig.atac.control.gene, gid), control.peak.count = n())
olig.control.gene.count.uniq <- unique(olig.control.gene.count[, c(12, 13, 22)])
olig.opioid.gene.count <- mutate(group_by(olig.atac.opioid.gene, gid), opioid.peak.count = n())
olig.opioid.gene.count.uniq <- unique(olig.opioid.gene.count[, c(12, 13, 22)])

# genes present in both samples; differences are control minus / over opioid
olig.count <- olig.control.gene.count.uniq %>% inner_join(olig.opioid.gene.count.uniq, by = "gid")
olig.summary <- mutate(
  group_by(olig.count, gid),
  peak.diff = control.peak.count - opioid.peak.count,
  log2fc = log2(olig.276 / olig.302)
)
# Reduce each 10 kb summary to columns 1, 6 and 7, tag the cell type and
# stack the three tables.
glu.df <- glu.summary[, c(1, 6, 7)]
glu.df[["cell_type"]] <- "GLU"

gaba.df <- gaba.summary[, c(1, 6, 7)]
gaba.df[["cell_type"]] <- "GABA"

olig.df <- olig.summary[, c(1, 6, 7)]
olig.df[["cell_type"]] <- "OLIG"

all.df <- rbind(glu.df, gaba.df, olig.df)
library(ggplot2)
# setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")
# pdf("peak.exp.2kb.pdf")
# ggplot(all.df) + geom_point(aes(x=peak.diff, y=log2fc, color=cell_type)) + facet_grid(cell_type ~ .) + theme_classic()
# dev.off()
# Give the three DE tables the same column names, then attach the DE call of
# each gene to its 10 kb peak/expression row. The left join keeps genes
# without a DE entry (their DE columns are NA).
names(glu.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
names(gaba.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
names(olig.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")

glu.df.de <- glu.df %>% left_join(glu.DE, by = "gid")
gaba.df.de <- gaba.df %>% left_join(gaba.DE, by = "gid")
olig.df.de <- olig.df %>% left_join(olig.DE, by = "gid")

all.de.df <- rbind(glu.df.de, gaba.df.de, olig.df.de)
# setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")
# pdf("peak.exp.10kb.DE.pdf")
# ggplot(all.de.df) + geom_point(aes(x=peak.diff, y=log2fc, alpha=0.5)) + facet_grid(cell_type ~ DE.sig) + theme_classic()
# dev.off()
#
# setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/")
# pdf("peak.exp.10kb.DE.padj.pdf")
# ggplot(all.de.df) + geom_point(aes(x=peak.diff, y=log2fc, color=DE.sig, alpha=0.5, size = padj)) + facet_grid(cell_type ~ DE.sig) + theme_classic()
# dev.off()
# Peak-count difference vs expression log2FC (points) and vs DE call (boxes),
# one facet row per cell type.
# FIX: alpha is set as a constant outside aes(); inside aes() it was mapped
# through the alpha scale, adding a legend instead of 50% transparency.
ggplot(na.omit(all.de.df)) + geom_point(aes(x=peak.diff, y=log2fc), alpha=0.5) + facet_grid(cell_type ~ DE.sig) + theme_classic()
ggplot(na.omit(all.de.df)) + geom_boxplot(aes(x=DE.sig, y=peak.diff), alpha=0.5) + facet_grid(cell_type ~ .) + theme_classic()
### look at just association between peak number and expression (test on control olig)
# FIX: cor() now drops incomplete pairs itself (use = "complete.obs"). The
# previous cor(na.omit(x), na.omit(y)) removed NAs from each vector
# separately, which misaligns the pairs whenever NAs sit in different rows.
# The numbers below were recorded from the original run.
# NOTE(review): both opioid values are identical to the ones recorded for the
# 2 kb windows, so they may be stale -- re-run to confirm.
cor(olig.summary$olig.276, olig.summary$control.peak.count, use = "complete.obs")
# 0.3616354
ggplot(na.omit(olig.summary)) + geom_point(aes(x=olig.276, y=control.peak.count)) + theme_classic()
cor(olig.summary$olig.302, olig.summary$opioid.peak.count, use = "complete.obs")
# 0.3152536
cor(gaba.summary$gaba.276, gaba.summary$control.peak.count, use = "complete.obs")
# 0.3431861
cor(gaba.summary$gaba.302, gaba.summary$opioid.peak.count, use = "complete.obs")
# 0.3849514
### use signal values (narrowpeak output file column 7) instead of peak count...
# Per gene: mean of column V7 over all intersecting peaks, for control and opioid.
olig.control.gene.value <- olig.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.value = mean(V7))
# columns 12, 13, 22: presumably gid, expression and the new mean -- confirm
olig.control.gene.value.uniq <- unique(olig.control.gene.value[,c(12,13,22)])
olig.opioid.gene.value <- olig.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.value = mean(V7))
olig.opioid.gene.value.uniq <- unique(olig.opioid.gene.value[,c(12,13,22)])
# full join: genes with peaks in only one sample are kept, with NA on the other side
olig.value <- full_join(olig.control.gene.value.uniq, olig.opioid.gene.value.uniq, by="gid")
olig.value.summary <- olig.value %>% group_by(gid) %>% mutate(peak.log2fc = log2(control.peak.value/opioid.peak.value), log2fc = log2(olig.276/olig.302))
# FIX: let cor() drop incomplete pairs (use = "complete.obs") instead of
# calling na.omit() on each vector separately; the full join above introduces
# NAs, and independent na.omit() calls only stay aligned by coincidence.
cor(olig.value.summary$olig.276, olig.value.summary$control.peak.value, use = "complete.obs")
# 0.2180869
ggplot(na.omit(olig.value.summary)) + geom_point(aes(x=olig.276, y=control.peak.value)) + theme_classic()
# change in signal vs change in expression, genes complete in both samples only
olig.value.summary.na <- na.omit(olig.value.summary)
cor(olig.value.summary.na$log2fc, olig.value.summary.na$peak.log2fc)
# -0.01571316
ggplot(na.omit(olig.value.summary)) + geom_point(aes(x=log2fc, y=peak.log2fc)) + theme_classic()
# What if we focus on categorizing instead? First look at the spread of the
# ATAC and expression values (recorded output below each call).
summary(all.de.df$log2fc)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -9.028 -0.156 0.000 -0.012 0.154 9.833 19677
summary(all.de.df$peak.diff)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -46.000 0.000 1.000 1.803 3.000 77.000 15841
summary(olig.value.summary$log2fc)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -9.028 -0.215 0.000 0.003 0.205 8.954 8418
summary(olig.value.summary$peak.log2fc)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -2.164 -0.599 -0.309 -0.313 -0.042 3.394 6771
# all.de.df = 93895

# Each category below is tabulated the same way: add the size of every
# (cell_type, DE.sig) group as `count`, then keep the distinct rows of
# columns 4, 6 and 9 (shown as cell_type, DE.sig, count in the output).
# Percentages in parentheses are the hand-computed UP share per cell type.

# ---- genes gaining or losing more than 5 peaks
all.de.df.peakdiff5 <- subset(all.de.df, abs(peak.diff) > 5)
# 7389
all.de.df.peakdiff5.count <- mutate(group_by(all.de.df.peakdiff5, cell_type, DE.sig), count = n())
peakdiff5.count <- unique(all.de.df.peakdiff5.count[, c(4, 6, 9)])
# cell_type DE.sig count
# <chr> <chr> <int>
# 1 GLU NO 2184
# 2 GLU NA 291
# 3 GLU DOWN 2
# 4 GLU UP 7 (0.3%)
# 5 GABA NO 2624
# 6 GABA NA 586
# 7 GABA DOWN 18
# 8 GABA UP 18 (0.55%)
# 9 OLIG NO 1441
# 10 OLIG NA 214
# 11 OLIG UP 3 (0.18%)
# 12 OLIG DOWN 1

# ---- genes gaining or losing more than 10 peaks
all.de.df.peakdiff10 <- subset(all.de.df, abs(peak.diff) > 10)
# 1978
all.de.df.peakdiff10.count <- mutate(group_by(all.de.df.peakdiff10, cell_type, DE.sig), count = n())
peakdiff10.count <- unique(all.de.df.peakdiff10.count[, c(4, 6, 9)])
# cell_type DE.sig count
# <chr> <chr> <int>
# 1 GLU NO 757
# 2 GLU NA 45
# 3 GLU UP 2
# 4 GABA NO 741
# 5 GABA NA 136
# 6 GABA UP 6
# 7 GABA DOWN 6
# 8 OLIG NO 262
# 9 OLIG NA 23

# ---- genes whose peak count changes by 5 or fewer
all.de.df.peakdiffno <- subset(all.de.df, abs(peak.diff) <= 5)
# 70665
all.de.df.peakdiffno.count <- mutate(group_by(all.de.df.peakdiffno, cell_type, DE.sig), count = n())
peakdiffno.count <- unique(all.de.df.peakdiffno.count[, c(4, 6, 9)])
# cell_type DE.sig count
# <chr> <chr> <int>
# 1 GLU NA 6956
# 2 GLU NO 19052
# 3 GLU DOWN 122
# 4 GLU UP 60 (0.23%)
# 5 GABA NA 5359
# 6 GABA NO 15723
# 7 GABA DOWN 173
# 8 GABA UP 92 (0.43%)
# 9 OLIG NA 6780
# 10 OLIG NO 16209
# 11 OLIG UP 122 (0.53%)
# 12 OLIG DOWN 17
# Compare DE genes (UP or DOWN) with tested-but-unchanged genes. Rows with a
# missing DE call belong to neither set.
all.DE <- subset(all.de.df, DE.sig %in% c("UP", "DOWN"))
# 740
summary(all.DE$peak.diff)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -4.000 0.000 2.000 2.117 3.000 61.000 105
summary(all.DE$log2fc)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -6.0024 -0.6920 -0.1001 -0.1394 0.3452 7.5251 209

all.nonDE <- subset(all.de.df, !is.na(DE.sig) & !(DE.sig %in% c("UP", "DOWN")))
# 61991
summary(all.nonDE$peak.diff)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -46.000 0.000 1.000 1.899 3.000 77.000 4758
summary(all.nonDE$log2fc)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# -8.978 -0.134 -0.006 0.008 0.144 9.833 8399

# Label the two sets, stack them and compare their distributions
all.DE[["type"]] <- "DE"
all.nonDE[["type"]] <- "nonDE"
all <- rbind(all.DE, all.nonDE)
ggplot(na.omit(all)) + geom_violin(aes(x = type, y = peak.diff)) + theme_classic()
ggplot(na.omit(all)) + geom_violin(aes(x = type, y = log2fc)) + theme_classic()
--> rerun using only MACS2 peaks with signal value (narrowPeak column 7) > 5 and -log10 p-value (narrowPeak column 8) > 10
library(tidygenomics)
library(dplyr)
library(tidyr)
# Reload all inputs, this time keeping only ATAC peaks with V7 > 5 and
# V8 > 10 (narrowPeak columns 7 and 8).

# ---- gene annotation; gid is the text before the first ";" of the second
# space-separated piece of the attribute column
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Genome")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header = FALSE, sep = "\t")
names(gene) <- c("chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info")
gene.df <- gene %>% separate(info, into = c("gene", "id"), sep = " ")
gene.id <- gene.df %>% separate(id, into = "gid", sep = ";")

# ---- ATAC peaks (276 = control, 302 = opioid)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/atac.peaks")
glu.atac.control <- read.delim("GLU-276_TAGGCATG-GAGCCTTA_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header = FALSE, sep = "\t")
glu.atac.opioid <- read.delim("GLU-302_TCCTGAGC-AAGGCTAT_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header = FALSE, sep = "\t")
gaba.atac.control <- read.delim("GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1.narrowPeak", header = FALSE, sep = "\t")
gaba.atac.opioid <- read.delim("GABA-302_TCCTGAGC-TATCCTCT_HGHM2DSXY_L003_001.R1.narrowPeak", header = FALSE, sep = "\t")
olig.atac.control <- read.delim("OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header = FALSE, sep = "\t")
olig.atac.opioid <- read.delim("OLIG-302_TCCTGAGC-TTATGCGA_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header = FALSE, sep = "\t")
# 220004, 154360, 198303, 134045, 127091, 120009

# ---- keep the strong peaks only
glu.atac.control <- subset(glu.atac.control, V7 > 5 & V8 > 10)
glu.atac.opioid <- subset(glu.atac.opioid, V7 > 5 & V8 > 10)
gaba.atac.control <- subset(gaba.atac.control, V7 > 5 & V8 > 10)
gaba.atac.opioid <- subset(gaba.atac.opioid, V7 > 5 & V8 > 10)
olig.atac.control <- subset(olig.atac.control, V7 > 5 & V8 > 10)
olig.atac.opioid <- subset(olig.atac.opioid, V7 > 5 & V8 > 10)
# 39544, 45727, 22315, 31102, 32137, 43059

# ---- RNA-seq: DE calls and TMM counts (comma-separated, with header)
# NOTE(review): an earlier session reads "Gaba_edgeR_TMM_normalized_counts.csv"
# (one "b") -- confirm which spelling exists on disk.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
glu.DE <- read.csv("Glu_intersection.csv")
gaba.DE <- read.csv("Gabba_intersection.csv")
olig.DE <- read.csv("Olig_intersection.csv")
#setwd("/gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only")
gaba.TMM <- read.csv("Gabba_edgeR_TMM_normalized_counts.csv")
glu.TMM <- read.csv("Glu_edgeR_TMM_normalized_counts.csv")
olig.TMM <- read.csv("Olig_edgeR_TMM_normalized_counts.csv")

# Column 1 is the gene id. Columns 6 and 9 are later labelled as samples 276
# and 302 -- presumably those samples' counts; confirm against the CSV header.
glu.TMM.control <- glu.TMM[, c(1, 6)]
glu.TMM.opioid <- glu.TMM[, c(1, 9)]
gaba.TMM.control <- gaba.TMM[, c(1, 6)]
gaba.TMM.opioid <- gaba.TMM[, c(1, 9)]
olig.TMM.control <- olig.TMM[, c(1, 6)]
olig.TMM.opioid <- olig.TMM[, c(1, 9)]
# glu: per-gene ATAC peak counts (peaks extended by 2 kb) next to control/opioid expression
# Name the two-column expression tables: gene id plus the TMM value of sample 276 (control)
# or sample 302 (opioid).
colnames(glu.TMM.control) <- c("gid", "glu.276")
colnames(glu.TMM.opioid) <- c("gid", "glu.302")
# Attach gene coordinates (chr/start/end from the annotation) to each expression table.
glu.TMM.control.gid <- inner_join(glu.TMM.control, gene.id, by="gid")
glu.TMM.opioid.gid <- inner_join(glu.TMM.opioid, gene.id, by="gid")
# Build chr/start/end columns for the peaks, widening every peak by 2000 bp on each side so that
# peaks within 2 kb of a gene also overlap it.
glu.atac.control$chr <- glu.atac.control$V1
glu.atac.control$start <- glu.atac.control$V2 - 2000
glu.atac.control$end <- glu.atac.control$V3 + 2000
# Overlap join of widened peaks with genes; one output row per overlapping peak/gene pair.
glu.atac.control.gene <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by=c("chr", "start", "end"))
glu.atac.opioid$chr <- glu.atac.opioid$V1
glu.atac.opioid$start <- glu.atac.opioid$V2 - 2000
glu.atac.opioid$end <- glu.atac.opioid$V3 + 2000
glu.atac.opioid.gene <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by=c("chr", "start", "end"))
# Count overlapping peaks per gene; mutate() keeps one row per peak, so the count is repeated
# and unique() below collapses it to one row per gene.
glu.control.gene.count <- glu.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
# NOTE(review): columns are picked by position; the code below needs them to be gid, the TMM
# value and the new peak count -- confirm 12/13/22 against the genome_intersect() output.
glu.control.gene.count.uniq <- unique(glu.control.gene.count[,c(12,13,22)])
glu.opioid.gene.count <- glu.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
glu.opioid.gene.count.uniq <- unique(glu.opioid.gene.count[,c(12,13,22)])
# inner_join keeps only genes that have at least one peak in BOTH conditions.
glu.count <- inner_join(glu.control.gene.count.uniq, glu.opioid.gene.count.uniq, by="gid")
# peak.diff is control minus opioid; log2fc is log2(control / opioid) expression.
glu.summary <- glu.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(glu.276/glu.302))
# gaba
colnames(gaba.TMM.control) <- c("gid", "gaba.276")
colnames(gaba.TMM.opioid) <- c("gid", "gaba.302")
gaba.TMM.control.gid <- inner_join(gaba.TMM.control, gene.id, by="gid")
gaba.TMM.opioid.gid <- inner_join(gaba.TMM.opioid, gene.id, by="gid")
gaba.atac.control$chr <- gaba.atac.control$V1
gaba.atac.control$start <- gaba.atac.control$V2 - 2000
gaba.atac.control$end <- gaba.atac.control$V3 + 2000
gaba.atac.control.gene <- genome_intersect(gaba.atac.control, gaba.TMM.control.gid, by=c("chr", "start", "end"))
gaba.atac.opioid$chr <- gaba.atac.opioid$V1
gaba.atac.opioid$start <- gaba.atac.opioid$V2 - 2000
gaba.atac.opioid$end <- gaba.atac.opioid$V3 + 2000
gaba.atac.opioid.gene <- genome_intersect(gaba.atac.opioid, gaba.TMM.opioid.gid, by=c("chr", "start", "end"))
gaba.control.gene.count <- gaba.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
gaba.control.gene.count.uniq <- unique(gaba.control.gene.count[,c(12,13,22)])
gaba.opioid.gene.count <- gaba.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
gaba.opioid.gene.count.uniq <- unique(gaba.opioid.gene.count[,c(12,13,22)])
gaba.count <- inner_join(gaba.control.gene.count.uniq, gaba.opioid.gene.count.uniq, by="gid")
gaba.summary <- gaba.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(gaba.276/gaba.302))
# olig
colnames(olig.TMM.control) <- c("gid", "olig.276")
colnames(olig.TMM.opioid) <- c("gid", "olig.302")
olig.TMM.control.gid <- inner_join(olig.TMM.control, gene.id, by="gid")
olig.TMM.opioid.gid <- inner_join(olig.TMM.opioid, gene.id, by="gid")
olig.atac.control$chr <- olig.atac.control$V1
olig.atac.control$start <- olig.atac.control$V2 - 2000
olig.atac.control$end <- olig.atac.control$V3 + 2000
olig.atac.control.gene <- genome_intersect(olig.atac.control, olig.TMM.control.gid, by=c("chr", "start", "end"))
olig.atac.opioid$chr <- olig.atac.opioid$V1
olig.atac.opioid$start <- olig.atac.opioid$V2 - 2000
olig.atac.opioid$end <- olig.atac.opioid$V3 + 2000
olig.atac.opioid.gene <- genome_intersect(olig.atac.opioid, olig.TMM.opioid.gid, by=c("chr", "start", "end"))
olig.control.gene.count <- olig.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
olig.control.gene.count.uniq <- unique(olig.control.gene.count[,c(12,13,22)])
olig.opioid.gene.count <- olig.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
olig.opioid.gene.count.uniq <- unique(olig.opioid.gene.count[,c(12,13,22)])
olig.count <- inner_join(olig.control.gene.count.uniq, olig.opioid.gene.count.uniq, by="gid")
olig.summary <- olig.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, peak.diff.percent = ((control.peak.count-opioid.peak.count)/control.peak.count)*100, log2fc = log2(olig.276/olig.302))
glu.df <- glu.summary[,c(1,6,7)]
gaba.df <- gaba.summary[,c(1,6,7)]
olig.df <- olig.summary[,c(1,6,8)]
glu.df$cell_type <- "GLU"
gaba.df$cell_type <- "GABA"
olig.df$cell_type <- "OLIG"
all.df <- rbind(glu.df, gaba.df, olig.df)
library(ggplot2)
colnames(glu.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
glu.df.de <- left_join(glu.df, glu.DE, by="gid")
colnames(gaba.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
gaba.df.de <- left_join(gaba.df, gaba.DE, by="gid")
colnames(olig.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
olig.df.de <- left_join(olig.df, olig.DE, by="gid")
all.de.df <- rbind(glu.df.de, gaba.df.de, olig.df.de)
all.de.df.na <- na.omit(all.de.df[,1:4])
cor(all.de.df.na$peak.diff, all.de.df.na$log2fc)
# 0.001280673
all.de.df.na.olig <- subset(all.de.df.na, all.de.df.na$cell_type == "OLIG")
cor(all.de.df.na.olig$peak.diff, all.de.df.na.olig$log2fc)
# 0.008228202
# Peak-count difference vs expression log2 fold change, and peak-count difference by DE class,
# faceted by cell type. alpha is a fixed transparency, so it is set outside aes(); inside aes()
# it is treated as a data mapping, which adds a meaningless "0.5" legend and does not apply
# the intended transparency.
ggplot(na.omit(all.de.df)) + geom_point(aes(x=peak.diff, y=log2fc), alpha=0.5) + facet_grid(cell_type ~ DE.sig) + theme_classic()
ggplot(na.omit(all.de.df)) + geom_violin(aes(x=DE.sig, y=peak.diff), alpha=0.5) + facet_grid(cell_type ~ .) + theme_classic()
### look at just association between peak number and expression (test on control olig)
# use="complete.obs" drops incomplete (x, y) pairs together. Calling na.omit() on each vector
# separately can shift the pairing, or leave vectors of unequal length, whenever the NAs are not
# in the same rows. The recorded values below are from the original runs.
cor(olig.summary$olig.276, olig.summary$control.peak.count, use="complete.obs")
# 0.2880334
ggplot(olig.summary) + geom_point(aes(x=olig.276, y=control.peak.count)) + theme_classic()
cor(olig.summary$olig.302, olig.summary$opioid.peak.count, use="complete.obs")
# 0.3250295
cor(gaba.summary$gaba.276, gaba.summary$control.peak.count, use="complete.obs")
# 0.2472245
cor(gaba.summary$gaba.302, gaba.summary$opioid.peak.count, use="complete.obs")
# 0.2721847
### use signal values (narrowpeak output file column 7) instead of peak count...
olig.control.gene.value <- olig.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.value = mean(V7))
olig.control.gene.value.uniq <- unique(olig.control.gene.value[,c(12,13,22)])
olig.opioid.gene.value <- olig.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.value = mean(V7))
olig.opioid.gene.value.uniq <- unique(olig.opioid.gene.value[,c(12,13,22)])
olig.value <- full_join(olig.control.gene.value.uniq, olig.opioid.gene.value.uniq, by="gid")
olig.value.summary <- olig.value %>% group_by(gid) %>% mutate(peak.log2fc = log2(control.peak.value/opioid.peak.value), log2fc = log2(olig.276/olig.302))
# olig.value comes from a full_join, so it contains NAs for genes seen in only one condition.
# use="complete.obs" removes incomplete pairs row-wise instead of filtering each vector on its own.
cor(olig.value.summary$olig.276, olig.value.summary$control.peak.value, use="complete.obs")
# 0.1349466
olig.value.summary.na <- na.omit(olig.value.summary)
cor(olig.value.summary.na$log2fc, olig.value.summary.na$peak.log2fc)
# -0.01085165
##### try 10kb regions
# glu
colnames(glu.TMM.control) <- c("gid", "control.TMM")
colnames(glu.TMM.opioid) <- c("gid", "opioid.TMM")
glu.TMM.control.gid <- inner_join(glu.TMM.control, gene.id, by="gid")
glu.TMM.opioid.gid <- inner_join(glu.TMM.opioid, gene.id, by="gid")
glu.atac.control$chr <- glu.atac.control$V1
glu.atac.control$start <- glu.atac.control$V2 - 10000
glu.atac.control$end <- glu.atac.control$V3 + 10000
glu.atac.control.gene <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by=c("chr", "start", "end"))
glu.atac.opioid$chr <- glu.atac.opioid$V1
glu.atac.opioid$start <- glu.atac.opioid$V2 - 10000
glu.atac.opioid$end <- glu.atac.opioid$V3 + 10000
glu.atac.opioid.gene <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by=c("chr", "start", "end"))
glu.control.gene.count <- glu.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
glu.control.gene.count.uniq <- unique(glu.control.gene.count[,c(12,13,22)])
glu.opioid.gene.count <- glu.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
glu.opioid.gene.count.uniq <- unique(glu.opioid.gene.count[,c(12,13,22)])
glu.count <- inner_join(glu.control.gene.count.uniq, glu.opioid.gene.count.uniq, by="gid")
glu.summary <- glu.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(control.TMM/opioid.TMM))
# gaba
colnames(gaba.TMM.control) <- c("gid", "control.TMM")
colnames(gaba.TMM.opioid) <- c("gid", "opioid.TMM")
gaba.TMM.control.gid <- inner_join(gaba.TMM.control, gene.id, by="gid")
gaba.TMM.opioid.gid <- inner_join(gaba.TMM.opioid, gene.id, by="gid")
gaba.atac.control$chr <- gaba.atac.control$V1
gaba.atac.control$start <- gaba.atac.control$V2 - 10000
gaba.atac.control$end <- gaba.atac.control$V3 + 10000
gaba.atac.control.gene <- genome_intersect(gaba.atac.control, gaba.TMM.control.gid, by=c("chr", "start", "end"))
gaba.atac.opioid$chr <- gaba.atac.opioid$V1
gaba.atac.opioid$start <- gaba.atac.opioid$V2 - 10000
gaba.atac.opioid$end <- gaba.atac.opioid$V3 + 10000
gaba.atac.opioid.gene <- genome_intersect(gaba.atac.opioid, gaba.TMM.opioid.gid, by=c("chr", "start", "end"))
gaba.control.gene.count <- gaba.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
gaba.control.gene.count.uniq <- unique(gaba.control.gene.count[,c(12,13,22)])
gaba.opioid.gene.count <- gaba.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
gaba.opioid.gene.count.uniq <- unique(gaba.opioid.gene.count[,c(12,13,22)])
gaba.count <- inner_join(gaba.control.gene.count.uniq, gaba.opioid.gene.count.uniq, by="gid")
gaba.summary <- gaba.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(control.TMM/opioid.TMM))
# olig
colnames(olig.TMM.control) <- c("gid", "control.TMM")
colnames(olig.TMM.opioid) <- c("gid", "opioid.TMM")
olig.TMM.control.gid <- inner_join(olig.TMM.control, gene.id, by="gid")
olig.TMM.opioid.gid <- inner_join(olig.TMM.opioid, gene.id, by="gid")
olig.atac.control$chr <- olig.atac.control$V1
olig.atac.control$start <- olig.atac.control$V2 - 10000
olig.atac.control$end <- olig.atac.control$V3 + 10000
olig.atac.control.gene <- genome_intersect(olig.atac.control, olig.TMM.control.gid, by=c("chr", "start", "end"))
olig.atac.opioid$chr <- olig.atac.opioid$V1
olig.atac.opioid$start <- olig.atac.opioid$V2 - 10000
olig.atac.opioid$end <- olig.atac.opioid$V3 + 10000
olig.atac.opioid.gene <- genome_intersect(olig.atac.opioid, olig.TMM.opioid.gid, by=c("chr", "start", "end"))
olig.control.gene.count <- olig.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
olig.control.gene.count.uniq <- unique(olig.control.gene.count[,c(12,13,22)])
olig.opioid.gene.count <- olig.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
olig.opioid.gene.count.uniq <- unique(olig.opioid.gene.count[,c(12,13,22)])
olig.count <- inner_join(olig.control.gene.count.uniq, olig.opioid.gene.count.uniq, by="gid")
olig.summary <- olig.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(control.TMM/opioid.TMM))
glu.df <- glu.summary[,c(1,6,7)]
gaba.df <- gaba.summary[,c(1,6,7)]
olig.df <- olig.summary[,c(1,6,7)]
glu.df$cell_type <- "GLU"
gaba.df$cell_type <- "GABA"
olig.df$cell_type <- "OLIG"
all.df <- rbind(glu.df, gaba.df, olig.df)
library(ggplot2)
colnames(glu.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
glu.df.de <- left_join(glu.df, glu.DE, by="gid")
colnames(gaba.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
gaba.df.de <- left_join(gaba.df, gaba.DE, by="gid")
colnames(olig.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
olig.df.de <- left_join(olig.df, olig.DE, by="gid")
all.de.df <- rbind(glu.df.de, gaba.df.de, olig.df.de)
# Same two plots as for the 2 kb window, now on the 10 kb results. alpha is a constant, so it is
# set outside aes(); inside aes() it would be mapped as data and add a spurious legend.
ggplot(na.omit(all.de.df)) + geom_point(aes(x=peak.diff, y=log2fc), alpha=0.5) + facet_grid(cell_type ~ DE.sig) + theme_classic()
ggplot(na.omit(all.de.df)) + geom_violin(aes(x=DE.sig, y=peak.diff), alpha=0.5) + facet_grid(cell_type ~ .) + theme_classic()
### look at just association between peak number and expression (test on control olig)
# use="complete.obs" drops incomplete (x, y) pairs together. na.omit() on each vector separately
# can misalign the pairs when NAs fall in different rows. The recorded values below are from the
# original runs.
cor(olig.summary$control.TMM, olig.summary$control.peak.count, use="complete.obs")
# 0.3259667
ggplot(olig.summary) + geom_point(aes(x=control.TMM, y=control.peak.count)) + theme_classic()
cor(olig.summary$opioid.TMM, olig.summary$opioid.peak.count, use="complete.obs")
# 0.3618497
cor(gaba.summary$control.TMM, gaba.summary$control.peak.count, use="complete.obs")
# 0.2748574
cor(gaba.summary$opioid.TMM, gaba.summary$opioid.peak.count, use="complete.obs")
# 0.3055798
### use signal values (narrowpeak output file column 7) instead of peak count...
olig.control.gene.value <- olig.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.value = mean(V7))
olig.control.gene.value.uniq <- unique(olig.control.gene.value[,c(12,13,22)])
olig.opioid.gene.value <- olig.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.value = mean(V7))
olig.opioid.gene.value.uniq <- unique(olig.opioid.gene.value[,c(12,13,22)])
olig.value <- full_join(olig.control.gene.value.uniq, olig.opioid.gene.value.uniq, by="gid")
olig.value.summary <- olig.value %>% group_by(gid) %>% mutate(peak.log2fc = log2(control.peak.value/opioid.peak.value), log2fc = log2(control.TMM/opioid.TMM))
# olig.value comes from a full_join, so it contains NAs for genes seen in only one condition.
# use="complete.obs" removes incomplete pairs row-wise instead of filtering each vector on its own.
cor(olig.value.summary$control.TMM, olig.value.summary$control.peak.value, use="complete.obs")
# 0.1452439
ggplot(olig.value.summary) + geom_point(aes(x=control.TMM, y=control.peak.value)) + theme_classic()
olig.value.summary.na <- na.omit(olig.value.summary)
cor(olig.value.summary.na$log2fc, olig.value.summary.na$peak.log2fc)
# -0.008547665
ggplot(olig.value.summary) + geom_point(aes(x=log2fc, y=peak.log2fc)) + theme_classic()
## look at general distributions of peaks
glu.summary$cell_type <- "GLU"
gaba.summary$cell_type <- "GABA"
olig.summary$cell_type <- "OLIG"
all.summary <- rbind(glu.summary, gaba.summary, olig.summary)
colnames(glu.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
glu.df.de <- left_join(glu.summary, glu.DE, by="gid")
colnames(gaba.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
gaba.df.de <- left_join(gaba.summary, gaba.DE, by="gid")
colnames(olig.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
olig.df.de <- left_join(olig.summary, olig.DE, by="gid")
all.de.df <- rbind(glu.df.de, gaba.df.de, olig.df.de)
library(ggplot2)
#library(ggpubr)
#ggplot(all.de.df, aes(x=peak.diff, y=log2FC)) + geom_point() + facet_grid(cell_type ~ DE.sig) + theme_classic() + stat_cor(method = "pearson", label.x = -20, label.y = 2)
ggplot(all.de.df, aes(x=peak.diff, y=log2FC)) + geom_point() + facet_grid(cell_type ~ DE.sig) + theme_classic()
# Peak-count difference by DE class, one panel per cell type. alpha is a constant, so it is set
# outside aes(); inside aes() it would be mapped as data and add a spurious legend.
ggplot(all.de.df) + geom_boxplot(aes(x=DE.sig, y=peak.diff), alpha=0.5) + facet_grid(cell_type ~ .) + theme_classic()
all.de.na <- na.omit(all.de.df)
ggplot(all.de.na, aes(x=control.peak.count)) + geom_density(aes(color=DE.sig)) + facet_grid(DE.sig ~ cell_type) + theme_classic() + xlim(0,20)
ggplot(all.de.na, aes(x=opioid.peak.count)) + geom_density(aes(color=DE.sig)) + facet_grid(DE.sig ~ cell_type) + theme_classic() + xlim(0,20)
ggplot(all.de.na, aes(x=peak.diff)) + geom_density(aes(color=DE.sig)) + facet_grid(DE.sig ~ cell_type) + theme_classic()
ggplot(all.de.na, aes(x=peak.diff)) + geom_density(aes(color=DE.sig)) + facet_grid(DE.sig ~ cell_type) + theme_classic() + xlim(-2,2)
all.de.gr2 <- subset(all.de.na, all.de.na$control.peak.count >= 2 | all.de.na$opioid.peak.count >= 2)
ggplot(all.de.gr2, aes(x=peak.diff)) + geom_density(aes(color=DE.sig)) + facet_grid(DE.sig ~ cell_type) + theme_classic() + xlim(-4,4)
all.de.gr2.count <- all.de.gr2 %>% group_by(cell_type, DE.sig) %>% mutate(count = n())
all.de.gr2.count.uniq <- unique(all.de.gr2.count[,c(8,10,13)])
# cell_type DE.sig count
# 1 GLU NO 11110 / 16116 = 0.689377 --> count / total.count: fraction of genes with >= 2 peaks in control or opioid
# 2 GLU DOWN 40 / 75 = 0.5333333
# 3 GLU UP 29 / 41 = 0.7073171
# 4 GABA NO 8310 / 13680 = 0.6074561
# 5 GABA DOWN 58 / 102 = 0.5686275
# 6 GABA UP 30 / 52 = 0.5769231
# 7 OLIG NO 10557 / 14997 = 0.7039408
# 8 OLIG UP 69 / 92 = 0.75
# 9 OLIG DOWN 13 / 20 = 0.65
all.de.count <- all.de.na %>% group_by(cell_type, DE.sig) %>% mutate(total.count = n())
all.de.count.uniq <- unique(all.de.count[,c(8,10,13)])
# cell_type DE.sig count
# 1 GLU NO 16116
# 2 GLU DOWN 75
# 3 GLU UP 41
# 4 GABA NO 13680
# 5 GABA DOWN 102
# 6 GABA UP 52
# 7 OLIG NO 14997
# 8 OLIG UP 92
# 9 OLIG DOWN 20
all.de.gr2.count.all <- left_join(all.de.gr2.count.uniq, all.de.count.uniq, by=c("cell_type", "DE.sig"))
all.de.gr2.count.all$prop.gr2 <- all.de.gr2.count.all$count / all.de.gr2.count.all$total.count
ggplot(all.de.gr2.count.all, aes(x=cell_type, y=prop.gr2, fill=DE.sig)) + geom_bar(stat="identity", position="dodge") + theme_classic()
# Classify each gene by the log2 ratio of control to opioid peak counts:
#   "no.change" ratio exactly 1 (log2 == 0); "2x" log2 > 1; "-2x" log2 < -1; "mid" everything else.
# The comparisons are strict, so an exact 2-fold change (log2 == 1 or -1) is labelled "mid".
all.de.gr2.count.ratio <- all.de.gr2.count %>% mutate(peak.log2fc = log2(control.peak.count / opioid.peak.count)) %>% mutate(peak.category = ifelse(peak.log2fc == 0, "no.change", ifelse(peak.log2fc > 1, "2x", ifelse(peak.log2fc < -1, "-2x", "mid"))))
ggplot(all.de.gr2.count.ratio, aes(x=peak.log2fc)) + geom_density(aes(color=DE.sig)) + facet_grid(DE.sig ~ cell_type) + theme_classic()
all.de.gr2.count.ratio2 <- all.de.gr2.count.ratio %>% group_by(cell_type, DE.sig, peak.category) %>% mutate(category.count = n(), category.prop = category.count / count)
all.de.gr2.count.ratio2.uniq <- unique(all.de.gr2.count.ratio2[,c(8,10,15,17)])
# cell_type DE.sig peak.category category.prop
# 1 GLU NO mid 0.563
# 2 GLU NO no.change 0.378
# 3 GLU DOWN no.change 0.55
# 4 GLU NO 2x 0.0174
# 5 GLU NO -2x 0.0419
# 6 GLU UP no.change 0.345
# 7 GLU DOWN mid 0.45
# 8 GLU UP mid 0.517
# 9 GLU UP -2x 0.138
ggplot(all.de.gr2.count.ratio2.uniq, aes(x=DE.sig, y=category.prop, fill=peak.category)) + geom_bar(stat="identity") + theme_classic() + facet_grid(cell_type ~ .)
d <- subset(all.de.gr2.count.ratio2, all.de.gr2.count.ratio2$DE.sig == "UP")
dd <- subset(d, d$peak.category == "2x" | d$peak.category == "-2x")
dd[,c(1,8,10,11,14,15)]
# gid cell_type DE.sig log2FC peak.log2fc peak.category
# <chr> <chr> <chr> <dbl> <dbl> <chr>
# 1 FBLN2 GLU UP 0.745 -1.58 -2x
# 2 LOC101927189 GLU UP 0.982 -1.58 -2x
# 3 TJP3 GLU UP 0.748 -1.58 -2x
# 4 HNF4A GLU UP 0.648 -1.58 -2x
# 5 LOC102723409 GABA UP 0.516 1.58 2x
# 6 LOC105375492 GABA UP 0.651 -1.58 -2x
# 7 LOC105370491 GABA UP 1.15 -1.58 -2x
# 8 LOC107986454 OLIG UP 1.26 -2 -2x
# 9 PXDC1 OLIG UP 0.752 -2 -2x
# 10 IRF2BPL OLIG UP 0.782 -2 -2x
# NPAS4
# gid peak.diff cell_type DE.sig log2FC peak.log2fc peak.category
# <chr> <int> <chr> <chr> <dbl> <dbl> <chr>
# 1 NPAS4 0 GLU DOWN -1.54 0 no.change
# 2 NPAS4 0 GABA DOWN -1.32 0 no.change
# 3 NPAS4 0 OLIG NO -0.869 0 no.change
# gid peak.diff cell_type DE.sig log2FC peak.log2fc peak.category
# <chr> <int> <chr> <chr> <dbl> <dbl> <chr>
# 1 NR4A1 0 GLU DOWN -0.822 0 no.change
# 2 NR4A1 0 GABA DOWN -1.12 0 no.change
# 3 NR4A1 -3 OLIG NO 1.55 -0.678 mid
# On Summit:
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/bin/bedtools closest -a GCF_000001405.39_GRCh38.p13_genomic.gene.gtf -b GCF_000001405.39_GRCh38.p13_genomic.gene.gtf -io -id -D a > GCF_000001405.39_GRCh38.p13_genomic.gene.closest.up.gtf
# /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/bin/bedtools closest -a GCF_000001405.39_GRCh38.p13_genomic.gene.gtf -b GCF_000001405.39_GRCh38.p13_genomic.gene.gtf -io -iu -D a > GCF_000001405.39_GRCh38.p13_genomic.gene.closest.down.gtf
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.gene.closest.up.gtf .
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.gene.closest.down.gtf .
library(tidygenomics)
library(dplyr)
library(tidyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Genome")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header=F, sep="\t")
colnames(gene) <- c("chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info")
gene.df <- separate(gene, "info", c("gene", "id"), sep=" ")
gene.id <- separate(gene.df, "id", c("gid"), sep=";")
upstream <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.closest.up.gtf", header=F, sep="\t")
downstream <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.closest.down.gtf", header=F, sep="\t")
up.df <- upstream[,c(1,4,5,10,13,14)]
down.df <- downstream[,c(1,4,5,10,13,14)]
colnames(up.df) <- c("chr", "start", "end", "chr.up", "start.up", "end.up")
colnames(down.df) <- c("chr", "start", "end", "chr.down", "start.down", "end.down")
gene.up <- left_join(gene.id, up.df, by=c("chr", "start", "end"))
gene.up.down <- left_join(gene.up, down.df, by=c("chr", "start", "end"))
# 44958
## (+) start.up --> end.up --> start --> end --> start.down --> end.down
## (-) start.down <-- end.down <-- start <-- end <-- start.up <-- end.up
#gene.coord <- gene.up.down %>% mutate(genic.chr = chr, genic.start = start, genic.end = end, upstream.chr = chr.up, upstream.start = ifelse(strand == "+", end.up, end), upstream.end = ifelse(strand == "+", start, start.up), downstream.chr = chr.down, downstream.start = ifelse(strand == "+", end, end.down), downstream.end = ifelse(strand == "+", start.down, start))
#gene.coord <- gene.up.down %>% mutate(genic.chr = chr, genic.start = start, genic.end = end, promoter.chr = chr, promoter.start = ifelse(strand == "+" & end.up < start-2000, start-2000, ifelse(strand == "+", end.up+1, end)), promoter.end = ifelse(strand == "+", start, ifelse(strand == "-" & start.up > end+2000, end+2000, start.up-1))) %>% mutate(upstream.chr = chr.up, upstream.start = ifelse(strand == "+", end.up, promoter.end), upstream.end = ifelse(strand == "+", promoter.start, start.up), downstream.chr = chr.down, downstream.start = ifelse(strand == "+", end, end.down), downstream.end = ifelse(strand == "+", start.down, start))
# Derive four regions per gene: genic, promoter, upstream and downstream.
# The test `start.up < start` (upstream neighbour lies at lower coordinates) selects the branch;
# the commented-out versions above used strand == "+" for the same purpose.
#   genic:      the gene itself (chr, start, end).
#   promoter:   neighbour at lower coordinates  -> [start-2000, start], or [end.up+1, start] when
#               the neighbour ends within 2000 bp of the gene start;
#               otherwise                       -> [end, end+2000], or [end, start.up-1] when the
#               neighbour starts within 2000 bp of the gene end.
#   upstream:   the gap between the upstream neighbour and the promoter
#               ([end.up, promoter.start] or [promoter.end, start.up]).
#   downstream: the gap between the gene and the downstream neighbour
#               ([end, start.down] or [end.down, start]).
# NOTE(review): when start.up == start the second branch is taken -- confirm that is intended.
gene.coord <- gene.up.down %>% mutate(genic.chr = chr, genic.start = start, genic.end = end, promoter.chr = chr, promoter.start = ifelse(start.up < start & end.up < start-2000, start-2000, ifelse(start.up < start, end.up+1, end)), promoter.end = ifelse(start.up < start, start, ifelse(start.up > start & start.up > end+2000, end+2000, start.up-1))) %>% mutate(upstream.chr = chr.up, upstream.start = ifelse(start.up < start, end.up, promoter.end), upstream.end = ifelse(start.up < start, promoter.start, start.up), downstream.chr = chr.down, downstream.start = ifelse(start.up < start, end, end.down), downstream.end = ifelse(start.up < start, start.down, start))
# Drop genes whose neighbour chromosome is "." (presumably how the closest-gene files mark
# "no neighbour found" -- TODO confirm) and genes whose upstream or downstream interval is
# inverted (start > end). Note that this overwrites the earlier gene.df.
gene.df <- subset(gene.coord, gene.coord$upstream.chr != "." & gene.coord$downstream.chr != "." & gene.coord$upstream.start <= gene.coord$upstream.end & gene.coord$downstream.start <= gene.coord$downstream.end)
# 43915
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/atac.peaks")
glu.atac.control <- read.delim("GLU-276_TAGGCATG-GAGCCTTA_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header=F, sep="\t")
glu.atac.opioid <- read.delim("GLU-302_TCCTGAGC-AAGGCTAT_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header=F, sep="\t")
gaba.atac.control <- read.delim("GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1.narrowPeak", header=F, sep="\t")
gaba.atac.opioid <- read.delim("GABA-302_TCCTGAGC-TATCCTCT_HGHM2DSXY_L003_001.R1.narrowPeak", header=F, sep="\t")
olig.atac.control <- read.delim("OLIG-276_TAGGCATG-TCTCTCCG_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header=F, sep="\t")
olig.atac.opioid <- read.delim("OLIG-302_TCCTGAGC-TTATGCGA_HGHM2DSXY_L003_001.R1.macs2.narrowPeak", header=F, sep="\t")
glu.atac.control <- subset(glu.atac.control, glu.atac.control$V7 > 5 & glu.atac.control$V8 > 10)
glu.atac.opioid <- subset(glu.atac.opioid, glu.atac.opioid$V7 > 5 & glu.atac.opioid$V8 > 10)
gaba.atac.control <- subset(gaba.atac.control, gaba.atac.control$V7 > 5 & gaba.atac.control$V8 > 10)
gaba.atac.opioid <- subset(gaba.atac.opioid, gaba.atac.opioid$V7 > 5 & gaba.atac.opioid$V8 > 10)
olig.atac.control <- subset(olig.atac.control, olig.atac.control$V7 > 5 & olig.atac.control$V8 > 10)
olig.atac.opioid <- subset(olig.atac.opioid, olig.atac.opioid$V7 > 5 & olig.atac.opioid$V8 > 10)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
glu.DE <- read.delim("Glu_intersection.csv", header=T, sep=",")
gaba.DE <- read.delim("Gabba_intersection.csv", header=T, sep=",")
olig.DE <- read.delim("Olig_intersection.csv", header=T, sep=",")
#setwd("/gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only")
gaba.TMM <- read.delim("Gabba_edgeR_TMM_normalized_counts.csv", header=T, sep=",")
glu.TMM <- read.delim("Glu_edgeR_TMM_normalized_counts.csv", header=T, sep=",")
olig.TMM <- read.delim("Olig_edgeR_TMM_normalized_counts.csv", header=T, sep=",")
glu.TMM.control <- glu.TMM[,c(1,6)]
glu.TMM.opioid <- glu.TMM[,c(1,9)]
gaba.TMM.control <- gaba.TMM[,c(1,6)]
gaba.TMM.opioid <- gaba.TMM[,c(1,9)]
olig.TMM.control <- olig.TMM[,c(1,6)]
olig.TMM.opioid <- olig.TMM[,c(1,9)]
# glu
colnames(glu.TMM.control) <- c("gid", "glu.276")
colnames(glu.TMM.opioid) <- c("gid", "glu.302")
glu.TMM.control.gid <- inner_join(glu.TMM.control, gene.df, by="gid")
glu.TMM.opioid.gid <- inner_join(glu.TMM.opioid, gene.df, by="gid")
colnames(glu.atac.control) <- c("genic.chr", "genic.start", "genic.end", "atac", "peak", "dot", "val1", "val2", "val3", "val4")
colnames(glu.atac.opioid) <- c("genic.chr", "genic.start", "genic.end", "atac", "peak", "dot", "val1", "val2", "val3", "val4")
glu.atac.control.genic <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by=c("genic.chr", "genic.start", "genic.end"))
glu.atac.opioid.genic <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by=c("genic.chr", "genic.start", "genic.end"))
colnames(glu.atac.control) <- c("upstream.chr", "upstream.start", "upstream.end", "atac", "peak", "dot", "val1", "val2", "val3", "val4")
colnames(glu.atac.opioid) <- c("upstream.chr", "upstream.start", "upstream.end", "atac", "peak", "dot", "val1", "val2", "val3", "val4")
glu.atac.control.upstream <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by=c("upstream.chr", "upstream.start", "upstream.end"))
glu.atac.opioid.upstream <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by=c("upstream.chr", "upstream.start", "upstream.end"))
colnames(glu.atac.control) <- c("downstream.chr", "downstream.start", "downstream.end", "atac", "peak", "dot", "val1", "val2", "val3", "val4")
colnames(glu.atac.opioid) <- c("downstream.chr", "downstream.start", "downstream.end", "atac", "peak", "dot", "val1", "val2", "val3", "val4")
glu.atac.control.downstream <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by=c("downstream.chr", "downstream.start", "downstream.end"))
glu.atac.opioid.downstream <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by=c("downstream.chr", "downstream.start", "downstream.end"))
colnames(glu.atac.control) <- c("promoter.chr", "promoter.start", "promoter.end", "atac", "peak", "dot", "val1", "val2", "val3", "val4")
colnames(glu.atac.opioid) <- c("promoter.chr", "promoter.start", "promoter.end", "atac", "peak", "dot", "val1", "val2", "val3", "val4")
glu.atac.control.promoter <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by=c("promoter.chr", "promoter.start", "promoter.end"))
glu.atac.opioid.promoter <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by=c("promoter.chr", "promoter.start", "promoter.end"))
glu.control.genic.count <- glu.atac.control.genic %>% group_by(gid) %>% mutate(control.peak.count = n())
glu.control.genic.count.uniq <- unique(glu.control.genic.count[,c(9,10,37)])
glu.opioid.genic.count <- glu.atac.opioid.genic %>% group_by(gid) %>% mutate(opioid.peak.count = n())
glu.opioid.genic.count.uniq <- unique(glu.opioid.genic.count[,c(9,10,37)])
glu.genic.count <- inner_join(glu.control.genic.count.uniq, glu.opioid.genic.count.uniq, by="gid")
glu.genic.summary <- glu.genic.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(glu.276/glu.302))
glu.control.upstream.count <- glu.atac.control.upstream %>% group_by(gid) %>% mutate(control.peak.count = n())
glu.control.upstream.count.uniq <- unique(glu.control.upstream.count[,c(9,10,37)])
glu.opioid.upstream.count <- glu.atac.opioid.upstream %>% group_by(gid) %>% mutate(opioid.peak.count = n())
glu.opioid.upstream.count.uniq <- unique(glu.opioid.upstream.count[,c(9,10,37)])
glu.upstream.count <- inner_join(glu.control.upstream.count.uniq, glu.opioid.upstream.count.uniq, by="gid")
glu.upstream.summary <- glu.upstream.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(glu.276/glu.302))
glu.control.downstream.count <- glu.atac.control.downstream %>% group_by(gid) %>% mutate(control.peak.count = n())
glu.control.downstream.count.uniq <- unique(glu.control.downstream.count[,c(9,10,37)])
glu.opioid.downstream.count <- glu.atac.opioid.downstream %>% group_by(gid) %>% mutate(opioid.peak.count = n())
glu.opioid.downstream.count.uniq <- unique(glu.opioid.downstream.count[,c(9,10,37)])
glu.downstream.count <- inner_join(glu.control.downstream.count.uniq, glu.opioid.downstream.count.uniq, by="gid")
glu.downstream.summary <- glu.downstream.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(glu.276/glu.302))
glu.control.promoter.count <- glu.atac.control.promoter %>% group_by(gid) %>% mutate(control.peak.count = n())
glu.control.promoter.count.uniq <- unique(glu.control.promoter.count[,c(9,10,37)])
glu.opioid.promoter.count <- glu.atac.opioid.promoter %>% group_by(gid) %>% mutate(opioid.peak.count = n())
glu.opioid.promoter.count.uniq <- unique(glu.opioid.promoter.count[,c(9,10,37)])
glu.promoter.count <- inner_join(glu.control.promoter.count.uniq, glu.opioid.promoter.count.uniq, by="gid")
glu.promoter.summary <- glu.promoter.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(glu.276/glu.302))
glu.genic.df <- glu.genic.summary
glu.upstream.df <- glu.upstream.summary
glu.downstream.df <- glu.downstream.summary
glu.promoter.df <- glu.promoter.summary
# > nrow(glu.promoter.df)
# [1] 10258
# > nrow(glu.downstream.df)
# [1] 13092
# > nrow(glu.upstream.df)
# [1] 17697
# > nrow(glu.genic.df)
# [1] 15729
glu.genic.df$location <- "genic"
glu.upstream.df$location <- "upstream"
glu.downstream.df$location <- "downstream"
glu.promoter.df$location <- "promoter"
all.df <- rbind(glu.genic.df, glu.upstream.df, glu.downstream.df, glu.promoter.df)
colnames(glu.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
glu.df.de <- left_join(all.df, glu.DE, by="gid")
glu.df.de.na <- na.omit(glu.df.de)
library(ggplot2)
# GLU plots faceted by region type (genic / upstream / downstream / promoter). alpha is a
# constant, so it is set outside aes(); inside aes() it would be mapped as data and add a
# spurious legend.
ggplot(glu.df.de.na) + geom_violin(aes(x=DE.sig, y=log2FC), alpha=0.5) + facet_grid(location ~ .) + theme_classic()
ggplot(glu.df.de.na) + geom_point(aes(x=peak.diff, y=log2FC), alpha=0.5) + facet_grid(location ~ DE.sig) + theme_classic()
ggplot(glu.df.de.na) + geom_violin(aes(x=DE.sig, y=peak.diff), alpha=0.5) + facet_grid(location ~ .) + theme_classic()
de.gr2 <- subset(glu.df.de.na, glu.df.de.na$control.peak.count >= 2 | glu.df.de.na$opioid.peak.count >= 2)
ggplot(de.gr2, aes(x=peak.diff)) + geom_density(aes(color=DE.sig)) + facet_grid(DE.sig ~ location) + theme_classic() + xlim(-4,4)
de.gr2.count <- de.gr2 %>% group_by(location, DE.sig) %>% mutate(count = n())
de.gr2.count.uniq <- unique(de.gr2.count[,c(8,10,13)])
de.count <- glu.df.de.na %>% group_by(location, DE.sig) %>% mutate(total.count = n())
de.count.uniq <- unique(de.count[,c(8,10,13)])
de.gr2.count.all <- left_join(de.gr2.count.uniq, de.count.uniq, by=c("location", "DE.sig"))
de.gr2.count.all$prop.gr2 <- de.gr2.count.all$count / de.gr2.count.all$total.count
ggplot(de.gr2.count.all, aes(x=location, y=prop.gr2, fill=DE.sig)) + geom_bar(stat="identity", position="dodge") + theme_classic()
# Classify each gene/location row by the log2 ratio of control to opioid peak counts:
#   "no.change" ratio exactly 1 (log2 == 0); "2x" log2 > 1; "-2x" log2 < -1; "mid" everything else.
# The comparisons are strict, so an exact 2-fold change (log2 == 1 or -1) is labelled "mid".
glu.df.de.class <- de.gr2.count %>% mutate(peak.log2fc = log2(control.peak.count / opioid.peak.count)) %>% mutate(peak.category = ifelse(peak.log2fc == 0, "no.change", ifelse(peak.log2fc > 1, "2x", ifelse(peak.log2fc < -1, "-2x", "mid"))))
ggplot(glu.df.de.class, aes(x=peak.log2fc)) + geom_density(aes(color=DE.sig)) + facet_grid(DE.sig ~ location) + theme_classic()
glu.df.de.class2 <- glu.df.de.class %>% group_by(location, DE.sig, peak.category) %>% mutate(category.count = n(), category.prop = category.count / count)
glu.df.de.class2.uniq <- unique(glu.df.de.class2[,c(8,10,15,17)])
ggplot(glu.df.de.class2.uniq, aes(x=DE.sig, y=category.prop, fill=peak.category)) + geom_bar(stat="identity") + theme_classic() + facet_grid(location ~ .)
conda create -n 3d-dna
source activate 3d-dna
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
git clone https://github.com/lastz/lastz.git
cd lastz/
cd src/
make
make install
make test
pip install numpy --user
pip install scipy --user
pip install matplotlib --user
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
git clone https://github.com/theaidenlab/3d-dna.git
# Append the lastz and 3d-dna directories to PATH. A bare PATH=<dir> replaces the whole search
# path, so standard commands (git, make, bash, ...) are no longer found, and the second
# assignment would also discard the first.
export PATH="$PATH:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/lastz"
export PATH="$PATH:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/3d-dna"
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
source activate 3d-dna
module load jdk
module load parallel
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/3d-dna
bash run-asm-pipeline.sh --editor-repeat-coverage 20 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.fna /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer.4reps/aligned/merged_nodups.txt
https://bioconductor.org/packages/devel/bioc/vignettes/ATACseqQC/inst/doc/ATACseqQC.html#Quick_start –> could not install the necessary packages with the available R version; got it to work on Summit, BUT the annotation files do not match the alignments (different chromosome names)
# salloc -A SYB105 -N 2 -t 4:00:00
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam
# Index every duplicate-removed BAM. Glob directly rather than parsing `ls`
# output, and quote the name so unusual file names are passed through intact.
for bam in *.bam; do
    [ -e "$bam" ] || continue   # no BAMs present: the unexpanded glob is skipped
    /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/samtools/samtools index "$bam"
done
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# need R version 4.1 (https://www.biostars.org/p/498049/)
library(BiocManager)
BiocManager::install(c("ATACseqQC", "ChIPpeakAnno", "MotifDb", "GenomicAlignments",
                       "BSgenome.Hsapiens.UCSC.hg19", "TxDb.Hsapiens.UCSC.hg19.knownGene", "TxDb.Hsapiens.UCSC.hg38.knownGene",
                       "phastCons100way.UCSC.hg19"))
## load the library
library(ATACseqQC)
## input the bamFile from the ATACseqQC package
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam")
#bamfile <- system.file("extdata", "file.bam", package="ATACseqQC", mustWork=TRUE)
bamfile <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1.rmdups.bam"
library(Rsamtools)
#bamfile <- open(BamFile("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1.rmdups.bam"))
## sample label = file name without its ".bam" extension; the dot is escaped and
## the pattern anchored so only the trailing extension can ever be removed
bamfile.labels <- sub("\\.bam$", "", basename(bamfile))
## estimate library complexity
estimateLibComplexity(readsDupFreq(bamfile))
## fragment size distribution
fragSize <- fragSizeDist(bamfile, bamfile.labels)
## nucleosome positioning
## bamfile tags to be read in
possibleTag <- list("integer"=c("AM", "AS", "CM", "CP", "FI", "H0", "H1", "H2",
                                "HI", "IH", "MQ", "NH", "NM", "OP", "PQ", "SM",
                                "TC", "UQ"),
                    "character"=c("BC", "BQ", "BZ", "CB", "CC", "CO", "CQ", "CR",
                                  "CS", "CT", "CY", "E2", "FS", "LB", "MC", "MD",
                                  "MI", "OA", "OC", "OQ", "OX", "PG", "PT", "PU",
                                  "Q2", "QT", "QX", "R2", "RG", "RX", "SA", "TS",
                                  "U2"))
## scan the first 100 records and keep only the tags that are actually present
bamTop100 <- scanBam(BamFile(bamfile, yieldSize = 100),param = ScanBamParam(tag=unlist(possibleTag)))[[1]]$tag
tags <- names(bamTop100)[lengths(bamTop100)>0]
tags
# integer2 integer13 character15 character16 character22 character31
# "AS" "NM" "MC" "MD" "PG" "SA"
## files will be output into outPath
outPath <- "splited"
## showWarnings=FALSE: re-running the notebook should not warn that the directory exists
dir.create(outPath, showWarnings = FALSE)
## shift the coordinates of 5'ends of alignments in the bam file
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
## if you don't have an available TxDb, please refer
## GenomicFeatures::makeTxDbFromGFF to create one from gff3 or gtf file.
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
# need to rename chromosomes to match NCBI annotation
# (the BAM files were aligned against the NCBI GRCh38.p13 FASTA, so they carry
#  NC_* accessions rather than UCSC chr* names)
ncbi.chr <- data.frame(
  ucsc = c("chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17",
           "chr18", "chr19", "chr20", "chr21", "chr22", "chr1", "chr2", "chr3",
           "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chrX", "chrY"),
  ncbi = c("NC_000010.11", "NC_000011.10", "NC_000012.12", "NC_000013.11",
           "NC_000014.9", "NC_000015.10", "NC_000016.10", "NC_000017.11",
           "NC_000018.10", "NC_000019.10", "NC_000020.11", "NC_000021.9",
           "NC_000022.11", "NC_000001.11", "NC_000002.12", "NC_000003.12",
           "NC_000004.12", "NC_000005.10", "NC_000006.12", "NC_000007.14",
           "NC_000008.11", "NC_000009.12", "NC_000023.11", "NC_000024.10"),
  stringsAsFactors = FALSE)  # keep names as character so they can be assigned below
#seqinformation <- seqinfo(TxDb.Hsapiens.UCSC.hg38.knownGene)
seqinformation <- Seqinfo(genome="hg38")
library(dplyr)
# One row per sequence in the Seqinfo; the row count is taken from the object
# instead of a hard-coded 595 so a changed hg38 sequence list cannot break it.
hg38 <- data.frame(y = seq_along(seqnames(seqinformation)))
hg38$seqnames <- seqnames(seqinformation) %>% as.character
hg38$seqlengths <- seqlengths(seqinformation) %>% as.numeric
hg38$isCircular <- isCircular(seqinformation) %>% as.logical
hg38$genome <- genome(seqinformation) %>% as.character
head(hg38)
# Swap the UCSC names of the primary chromosomes for their NCBI accessions.
ucsc.idx <- match(ncbi.chr$ucsc, hg38$seqnames)
stopifnot(!anyNA(ucsc.idx))
hg38$seqnames[ucsc.idx] <- ncbi.chr$ncbi
seqinformation.hg38 <- Seqinfo(seqnames=hg38$seqnames, seqlengths=hg38$seqlengths, isCircular=hg38$isCircular, genome="hg38")
seqlev <- "NC_000020.11" ## subsample data for quick run
which <- as(seqinformation.hg38[seqlev], "GRanges")
gal <- readBamFile(bamfile, tag=tags, which=which, asMates=TRUE, bigFile=TRUE)
shiftedBamfile <- file.path(outPath, "shifted.bam")
gal1 <- shiftGAlignmentsList(gal, outbam=shiftedBamfile)
## TSS enrichment score
txs <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
# The alignments use NCBI accessions, so the transcripts must too; otherwise the
# reads and the TSSs share no sequence names. Previously a renamed copy (mapped
# to ENSEMBL style, which is not the NC_* naming either) was built but the
# un-renamed `txs` was what got passed to TSSEscore().
txs <- keepSeqlevels(txs, ncbi.chr$ucsc, pruning.mode="coarse")
ncbi.txs <- renameSeqlevels(txs, setNames(ncbi.chr$ncbi, ncbi.chr$ucsc))
tsse <- TSSEscore(gal1, ncbi.txs)
tsse$TSSEscore
plot(100*(-9:10-.5), tsse$values, type="b",
     xlab="distance to TSS",
     ylab="aggregate TSS score")
## plot correlations for multiple samples
# The directory must be a quoted string (the bare path was a syntax error), and
# `pattern` is a regular expression, not a shell glob: "\\.bam$" selects the BAM
# files and skips their .bam.bai indexes.
bamfiles <- dir("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam", "\\.bam$", full.names=TRUE)
# The BAMs were aligned to the NCBI GRCh38.p13 FASTA, so chromosome 1 is named
# by its accession rather than "chr1".
corr.seqlev <- "NC_000001.11"
gals <- lapply(bamfiles, function(bamfile){
  readBamFile(bamFile=bamfile, tag=character(0),
              which=GRanges(corr.seqlev, IRanges(1, 1e6)),
              asMates=FALSE)
})
# Use the hg38 transcript set (the alignments are GRCh38, not hg19) and rename
# chr1 to the NCBI accession so reads and transcripts share a sequence name.
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txs <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
txs <- keepSeqlevels(txs, "chr1", pruning.mode="coarse")
txs <- renameSeqlevels(txs, c(chr1 = corr.seqlev))
library(GenomicAlignments)
plotCorrelation(GAlignmentsList(gals), txs, seqlev=corr.seqlev)
https://greenleaflab.github.io/ChrAccR/articles/overview.html
# cut -f 1 /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files/atac.csaw.key.rmdups.txt | sed 's/R1.//g' > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files/atac.csaw.key.rmdups.samplename.txt
library(ChrAccR)
library(ggplot2)
# use a grid-less theme
theme_set(muRtools::theme_nogrid())
sampleAnnotFn <- file.path("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files/", "atac.csaw.key.rmdups.samplename.txt")
bamDir <- file.path("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam")
sampleAnnot <- read.table(sampleAnnotFn, sep="\t", header=TRUE, stringsAsFactors=FALSE)
colnames(sampleAnnot) <- c("bamFilename", "cell.type")
library(tidyr)
# split the combined label into its two parts; sampleAnnotID is the table that
# carries both annotation columns ("condition" and "cell.type") configured below
sampleAnnotID <- separate(sampleAnnot, cell.type, c("condition", "cell.type"))
# add a column that ChrAccR can use to find the correct bam file for each sample
bamFilenameFull <- file.path(bamDir, sampleAnnotID[,"bamFilename"])
# Previously the path column was added only to sampleAnnot, which has no
# "condition" column; it is now added to sampleAnnotID as well. sampleAnnot
# keeps it too so any existing use of sampleAnnot still works.
sampleAnnotID[,"bamFilenameFull"] <- bamFilenameFull
sampleAnnot[,"bamFilenameFull"] <- bamFilenameFull
setConfigElement("annotationColumns", c("cell.type", "condition"))
setConfigElement("colorSchemes", c(
  getConfigElement("colorSchemes"),
  list(
    "condition"=c("control"="royalblue", "heroin"="tomato"),
    "cell.type"=c("gaba"="purple3", "glu"="blue3", "olig"="seagreen3")
  )
))
setConfigElement("filteringCovgCount", 1L)
setConfigElement("filteringCovgReqSamples", 0.25)
setConfigElement("filteringSexChroms", TRUE)
setConfigElement("normalizationMethod", "quantile")
diffCompNames <- c(
  "control vs heroin [sampleGroup]",
  "gaba vs glu [sampleGroup]",
  "gaba vs olig [sampleGroup]",
  "glu vs olig [sampleGroup]"
)
setConfigElement("differentialCompNames", diffCompNames)
# prepare a GRanges object of TSS coordinates
tssGr <- muRtools::getAnnotGrl.gencode("gencode.v27")[["gene"]]
tssGr <- tssGr[elementMetadata(tssGr)[,"gene_type"]=="protein_coding"]
tssGr <- promoters(tssGr, upstream=0, downstream=1)
tssGr
# compute TSS enrichment
# NOTE(review): `dsa` is never created in this section and "TCD8EM_U_1002" is
# not one of this project's samples -- build the ChrAccR dataset from
# sampleAnnotID first and substitute a real sample ID before running this.
tsse <- getTssEnrichment(dsa, "TCD8EM_U_1002", tssGr)
# enrichment score: number of insertions at the TSS
# over number of insertion in background regions
tsse$tssEnrichment
# plot
tsse$plot
–> just run it on the commandline? https://www.biostars.org/p/391041/
#bamfile <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1.rmdups.bam"
# # Summit
# source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
# conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3
# #conda install -c biobuilds bedops # not working
# conda config --add channels r
# conda config --add channels defaults
# conda config --add channels conda-forge
# conda config --add channels bioconda
# conda install bedops
#
#
# # Andes
# module load python
# source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# conda install -c bioconda bedops # not working
#
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/
# # https://github.com/bedops/bedops/releases
# git clone https://github.com/bedops/bedops.git
# cd bedops
# make
# make install # why is this not processing all of the bins?
# scp bedops-2.4.40.tar.gz noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/.
# tar -xf bedops-2.4.40.tar.gz
## can't get conda to install bedops... use Piet's environment instead (Andes) --> have to force conda-forge
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
#conda activate /gpfs/alpine/syb105/proj-shared/piet/conda-environments/andes-gwas
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs
# conda update -n base conda
# conda create --name andes-bedops python=3.5
conda activate andes-bedops
# conda install -c conda-forge -c bioconda bedops=2.4.39=h7d875b9_1
# conda install -c conda-forge -c bioconda ucsc-fetchchromsizes
# conda install -c conda-forge -c bioconda subread
# conda install -c conda-forge -c bioconda bedtools
# First get hg38 refSeq annotations (Credit to Alex Reynolds , https://www.biostars.org/p/250091/#250304)
#wget -qO- http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz | gunzip -c - | awk 'BEGIN{ OFS="\t" }{ print $3, $5, $6, $13, $10, $4 }' - | sort-bed - > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/refGene.bed
##### Need to be using the reference I used to generate the bam files --> NCBI not UCSC
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/
# Build a BED6 (chrom, start, end, name, score, strand) from the gene-level GTF.
# NOTE(review): awk splits on any whitespace here, so for a standard GTF line
# $9 is presumably the literal token `gene_id`, not the gene name, and $4 is
# the 1-based GTF start used unshifted as a BED start -- confirm both are
# acceptable for this use (only total assigned reads are read downstream).
awk 'BEGIN{ OFS="\t" }{ print $1, $4, $5, $9, $8, $7 }' GCF_000001405.39_GRCh38.p13_genomic.gene.gtf | sort-bed - > refGene.bed
# Then split them by strand and pad around the stranded-start position of the annotation (Take areas around TSS -/+ 1000)
# (+ strand: TSS is the start column; - strand: TSS is the end column)
awk 'BEGIN{ OFS="\t" } ($6 == "+" && $2 > 1000) { print $1, ($2 - 1000), ($2 + 1000), $4, $5, $6 }' refGene.bed > refGene.tss.for.padded.bed
awk 'BEGIN{ OFS="\t" } ($6 == "-" && $3 > 1000) { print $1, ($3 - 1000), ($3 + 1000), $4, $5, $6 }' refGene.bed > refGene.tss.rev.padded.bed
bedops --everything refGene.tss.for.padded.bed refGene.tss.rev.padded.bed > refGene.tss.padded.bed
# Keep only TSS regions within chromosomal bounds.
#fetchChromSizes hg38 | awk '{ print $1"\t0\t"$2; }' | sort-bed - > hg38.bounds.bed
awk '{ print $1"\t0\t"$2; }' GRCh38.p13.chr.size | sort-bed - > hg38.bounds.bed
bedops --element-of 100% refGene.tss.padded.bed hg38.bounds.bed > refGene.tss.padded.filtered.bed
# convert to SAF
awk 'BEGIN{FS=OFS="\t"; print "GeneID\tChr\tStart\tEnd\tStrand"}{print $4, $1, $2+1, $3, "."}' refGene.tss.padded.filtered.bed > refGene.tss.padded.filtered.bed.saf
# Count reads using FeatureCounts.
# (single-sample trial; written to its own file so the flank run below does not
#  overwrite it -- both runs previously used -o readCountInPeaks.txt)
featureCounts -T 6 -a refGene.tss.padded.filtered.bed.saf -F SAF -o readCountInPeaks.single.TSS.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1.rmdups.bam
# Record number of successful alignments to the TSS
# || Total alignments : 104501732 ||
# || Successfully assigned alignments : 6921672 (6.6%) ||
# Second part is to do the same thing but on different regions (100bp -/+ of TSS regions)
# Use the same file for the TSS regions Use flank from bedtools to get these 100bp -/+ regions (with the exclusion of the TSS region) Flank tool requires a genome file (different from the one you used to get the TSS) to get the 100bp -/+ First you need a genome file defining the length of each chromosome or contig.
#wget http://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.fa.gz
#gunzip hg38.fa.gz
#samtools faidx hg38.fa
#cut -f 1,2 hg38.fa.fai > hg38.ucsc.chrom.sizes #(Credit to igor, https://www.biostars.org/p/206140/#206169)
# Use flank from bedtools
bedtools flank -i refGene.tss.padded.filtered.bed -g GRCh38.p13.chr.size -b 100 > Flanks_100_up_down.bed
# Convert to SAF
awk 'BEGIN{FS=OFS="\t"; print "GeneID\tChr\tStart\tEnd\tStrand"}{print $4, $1, $2+1, $3, "."}' Flanks_100_up_down.bed > Flanks_100_up_down.bed.saf
# Run with featurecounts
featureCounts -T 6 -a Flanks_100_up_down.bed.saf -F SAF -o readCountInPeaks.single.100bp.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/GABA-276_TAGGCATG-GCGTAAGA_HGHM2DSXY_L003_001.R1.rmdups.bam
# Record number of successful alignments to the 100bp -/+
# || Total alignments : 104501732 ||
# || Successfully assigned alignments : 887315 (0.8%) ||
# To get TSS Enrichemnt score Divide total number of successful alignments to the TSS by total number of successful alignments to the 100bp -/+
## 6921672 / 887315 = 7.800693
# For all files in directory:
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/
# Count reads using FeatureCounts (TSS)
featureCounts -T 6 -a refGene.tss.padded.filtered.bed.saf -F SAF -o readCountInPeaks.TSS.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/*rmdups.bam
# Run with featurecounts (100bp -/+)
featureCounts -T 6 -a Flanks_100_up_down.bed.saf -F SAF -o readCountInPeaks.100bp.txt /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam/*rmdups.bam
# R
# Build a per-sample TSS enrichment table from the two featureCounts summaries:
# enrichment = reads assigned to TSS windows / reads assigned to the flanks.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/")
tss <- read.delim("readCountInPeaks.TSS.txt.summary", header=T, sep="\t")
hundred <- read.delim("readCountInPeaks.100bp.txt.summary", header=T, sep="\t")
library(stringr)
library(dplyr)
# read.delim() turns the BAM paths in the header into syntactic names, so the
# directory part shows up as this dotted prefix on every sample column.
bam.prefix <- "X.gpfs.alpine.syb105.proj.shared.Personal.noshayjm.projects.opioid.atac.bwa.output.rmdups.bam."
# Return one row per sample with its "Assigned" read count.
# The row is selected by its label in the first (status) column instead of by
# position after a transpose, so the result does not depend on how many status
# rows the installed featureCounts version writes; `sample` is a plain
# character column rather than a nested data frame.
assigned.per.sample <- function(summary.df) {
  assigned <- summary.df[summary.df[[1]] == "Assigned", -1, drop=FALSE]
  stopifnot(nrow(assigned) == 1)
  data.frame(sample = sub(bam.prefix, "", colnames(assigned), fixed=TRUE),
             Assigned = as.numeric(unlist(assigned[1, ])),
             stringsAsFactors = FALSE)
}
tss.label <- assigned.per.sample(tss)
hundred.label <- assigned.per.sample(hundred)
# Assigned.x = TSS windows, Assigned.y = 100bp flanks
tss.hundred <- left_join(tss.label, hundred.label, by="sample")
tss.hundred$TSS.align <- as.numeric(tss.hundred$Assigned.x)
tss.hundred$hundred.align <- as.numeric(tss.hundred$Assigned.y)
tss.hundred$TSS.enrichment <- tss.hundred$TSS.align / tss.hundred$hundred.align
tss.values <- as.data.frame(tss.hundred[,c("sample", "TSS.align", "hundred.align", "TSS.enrichment")])
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.bam")
write.table(tss.values, "TSS.enrichment.txt", quote=F, row.names=F, sep="\t")
Transcription start site (TSS) enrichment values are dependent on the reference files used; cutoff values for high quality data are listed below.
GRCh38 Refseq TSS annotation
Below 5: concerning; 5 - 7: acceptable; above 7: ideal (source: https://www.encodeproject.org/atac-seq/)
# # scp /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/ATAC.All.Metadata.txt noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/.
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(DESeq2)
library(umap)
library(ggplot2)
library(magrittr)
library(dplyr)
# Set the seed so our results are reproducible:
set.seed(12345)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.summit.sample.outlier.csv")
metadata <- read.delim("ATAC.All.Metadata.txt", header=T, sep="\t")
library(tidyr)
# "Sample" is split at "_" into the sample ID and the rest of the file name
meta <- separate(metadata, Sample, c("Sample.ID", "extension"), sep="_")
meta.df <- meta[,c(1,4:ncol(meta))]
# The metadata mixes letter case ("CONTROL"/"Control", "m"/"M", "f"/"F"); left
# as-is, each spelling becomes its own factor level in the design and its own
# colour in the plot.
meta.df$GROUP <- toupper(meta.df$GROUP)
meta.df$SEX <- toupper(meta.df$SEX)
# Keep the samples listed in the sample sheet, in sample-sheet order: the UMAP
# rows below are labelled with samples$Sample.ID, and DESeq2 pairs colData rows
# with count columns purely by position.
meta.idx <- match(samples$Sample.ID, meta.df$Sample.ID)
stopifnot(!anyNA(meta.idx))
meta.sample <- meta.df[meta.idx, ]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.narrowpeak.peakset.counts.txt", header=T, sep="\t")
# columns 4:151 are taken as the per-sample counts (NOTE(review): positional -- confirm)
counts.df <- counts[,4:151]
counts.mat <- round(counts.df)
dds <- DESeqDataSetFromMatrix(
  countData = counts.mat, # the counts values for all samples in our dataset
  colData = meta.sample, # annotation data for the samples in the counts data frame
  design = ~ Celltype + GROUP + AGE
)
# Normalize and transform the data in the `DESeqDataSet` object
# using the `vst()` function from the `DESeq2` R package
dds_norm <- vst(dds)
# First we are going to retrieve the normalized data
# from the `DESeqDataSet` object using the `assay()` function
normalized_counts <- assay(dds_norm) %>%
  t() # We need to transpose this data so each row is a sample
# Now perform UMAP on the normalized data
umap_results <- umap::umap(normalized_counts)
# Make into data frame for plotting with `ggplot2`
# The UMAP values we need for plotting are stored in the `layout` element
umap_plot_df <- data.frame(umap_results$layout) %>%
  # Turn sample IDs stored as row names into a column
  # tibble::rownames_to_column("Sample.ID") %>%
  mutate(Sample.ID = samples$Sample.ID) %>%
  # Add the metadata into this data frame; match by sample IDs
  inner_join(meta.sample, by = "Sample.ID")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.all.umap.outliers.narrowpeak.metadata.pdf")
ggplot(umap_plot_df, aes(x = X1, y = X2, color = GROUP, shape = Celltype)) + geom_point(size=3) + theme_classic()
dev.off()
#### differential
# Same UMAP, but on the counts for the differential (condition/tissue) peak set.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.summit.sample.outlier.csv")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.peakset.narrowpeak.differential.condition.tissue.counts.txt", header=T, sep="\t")
# columns 4:151 are taken as the per-sample counts (NOTE(review): positional -- confirm)
counts.df <- counts[,4:151]
counts.mat <- round(counts.df)
dds <- DESeqDataSetFromMatrix(
  countData = counts.mat, # the counts values for all samples in our dataset
  colData = samples, # annotation data for the samples in the counts data frame
  design = ~ Condition + Tissue
)
# Normalize and transform the data in the `DESeqDataSet` object
# using the `vst()` function from the `DESeq2` R package
dds_norm <- vst(dds)
# First we are going to retrieve the normalized data
# from the `DESeqDataSet` object using the `assay()` function
normalized_counts <- assay(dds_norm) %>%
  t() # We need to transpose this data so each row is a sample
# Now perform UMAP on the normalized data
umap_results <- umap::umap(normalized_counts)
# Make into data frame for plotting with `ggplot2`
# The UMAP values we need for plotting are stored in the `layout` element
umap_plot_df <- data.frame(umap_results$layout) %>%
  # Turn sample IDs stored as row names into a column
  # tibble::rownames_to_column("Sample.ID") %>%
  mutate(Sample.ID = samples$Sample.ID) %>%
  # Add the metadata into this data frame; match by sample IDs.
  # GROUP is upper-cased first: the metadata contains both "CONTROL" and
  # "Control", which would otherwise be drawn as two separate colours.
  inner_join(mutate(meta.df, GROUP = toupper(GROUP)), by = "Sample.ID")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
pdf("dba.hmmratac.all.umap.outliers.narrowpeak.differential.pdf")
ggplot(umap_plot_df, aes(x = X1, y = X2, color = GROUP, shape = Celltype)) + geom_point(size=3) + theme_classic()
dev.off()
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
library(DESeq2)
library(umap)
library(ggplot2)
library(magrittr)
library(dplyr)
# Set the seed so our results are reproducible:
set.seed(12345)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.summit.sample.outlier.csv")
metadata <- read.delim("ATAC.All.Metadata.txt", header=T, sep="\t")
library(tidyr)
meta <- separate(metadata, Sample, c("Sample.ID", "extension"), sep="_")
meta.df <- meta[,c(1,4:ncol(meta))]
# The metadata mixes letter case ("CONTROL"/"Control", "m"/"M", "f"/"F"); each
# spelling would count as a separate level when variance is attributed to GROUP
# or SEX. Values recorded in this notebook before this normalisation (including
# the summaries pasted below) were computed on the raw labels.
meta.df$GROUP <- toupper(meta.df$GROUP)
meta.df$SEX <- toupper(meta.df$SEX)
# Keep the samples listed in the sample sheet, in sample-sheet order, so that
# metadata row i describes count column i (cbind() and getVarianceExplained()
# below pair them purely by position).
meta.idx <- match(samples$Sample.ID, meta.df$Sample.ID)
stopifnot(!anyNA(meta.idx))
meta.sample <- meta.df[meta.idx, ]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.narrowpeak.peakset.counts.txt", header=T, sep="\t")
# columns 4:151 are taken as the per-sample counts (NOTE(review): positional -- confirm)
counts.df <- counts[,4:151]
counts.mat <- round(counts.df)
stopifnot(ncol(counts.mat) == nrow(meta.sample))
# PCA
library(devtools)
# samples in rows, peaks in columns
counts.mat.t <- as.data.frame(t(counts.mat))
counts.metadata <- cbind(meta.sample, counts.mat.t)
project.pca <- prcomp(counts.mat.t)
summary(project.pca)
# percentage of total variance carried by each principal component
project.pca.proportionvariances <- ((project.pca$sdev^2) / (sum(project.pca$sdev^2)))*100
# PCA mix data
library(PCAmixdata) # plain PCA cannot handle continuous and discrete variables together
split <- splitmix(counts.metadata)
X1 <- split$X.quanti
X2 <- split$X.quali
res.pcamix <- PCAmix(X.quanti=X1, X.quali=X2,rename.level=TRUE,graph=FALSE)
res.pcamix
res.pcamix$eig
# scater: getVarianceExplained(x, variables)
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("scater")
library(scater)
# per-peak variance explained by each metadata column
var.df <- getVarianceExplained(counts.mat, meta.sample)
write.table(var.df, "atac.variance.explained.txt", quote=F, row.names=F, sep="\t")
colMeans(var.df)
# Sample.ID Celltype Batch GROUP SEX
# NaN 1.3959392 3.7945439 28.6169342 6.5496729
# AGE CAUSE.OF.DEATH Heroin.History Tissue.Source Tissue.type
# 10.3905897 36.8970530 26.2908165 5.5327659 0.5656307
# pH
# 1.5623352
summary(var.df)
# Sample.ID Celltype Batch GROUP
# Min. : NA Min. : 0.000009 Min. : 0.0000 Min. : 0.00001
# 1st Qu.: NA 1st Qu.: 0.506796 1st Qu.: 0.8756 1st Qu.:17.44414
# Median : NA Median : 1.134975 Median : 2.7213 Median :31.49772
# Mean :NaN Mean : 1.395939 Mean : 3.7945 Mean :28.61693
# 3rd Qu.: NA 3rd Qu.: 2.005226 3rd Qu.: 6.3288 3rd Qu.:39.44948
# Max. : NA Max. :12.060844 Max. :19.4618 Max. :68.77388
# NA's :145688
# SEX AGE CAUSE.OF.DEATH Heroin.History
# Min. : 0.00022 Min. : 0.000 Min. : 2.147 Min. : 2.002
# 1st Qu.: 2.02813 1st Qu.: 4.754 1st Qu.:26.585 1st Qu.:18.938
# Median : 4.28787 Median :10.601 Median :37.509 Median :25.684
# Mean : 6.54967 Mean :10.391 Mean :36.897 Mean :26.291
# 3rd Qu.: 7.13518 3rd Qu.:15.108 3rd Qu.:46.600 3rd Qu.:32.914
# Max. :49.42049 Max. :34.724 Max. :77.606 Max. :77.000
#
# Tissue.Source Tissue.type pH
# Min. : 0.000 Min. : 0.00000 Min. : 0.0000
# 1st Qu.: 1.077 1st Qu.: 0.05186 1st Qu.: 0.3786
# Median : 3.342 Median : 0.23844 Median : 1.0681
# Mean : 5.533 Mean : 0.56563 Mean : 1.5623
# 3rd Qu.: 5.930 3rd Qu.: 0.71409 3rd Qu.: 2.2940
# Max. :47.882 Max. :13.14798 Max. :17.4989
# > unique(meta.sample$Batch)
# [1] "A" "B"
# > unique(meta.sample$GROUP)
# [1] "CONTROL" "Control" "HEROIN"
# > unique(meta.sample$SEX)
# [1] "m" "f" "M" "F"
# > unique(meta.sample$AGE)
# [1] 46 65 44 49 53 48 58 20 22 30 36 45 43 15 31 19 27 38 26 24 35 18 23 32 25
# [26] 29 28 21
# > unique(meta.sample$CAUSE.OF.DEATH)
# [1] "heart failure"
# [2] "sudden,heart"
# [3] "cardiac failure"
# [4] "sudden death"
# [5] "sudden, natural"
# [6] "cardiac failure. pulm. embolus. pneumonia"
# [7] "juvenile sudden death. viral infection"
# [8] "electric shock"
# [9] "viral infection"
# [10] "0"
# [11] "Stabbing (homicide)"
# [12] "Gunshot wound (homicide)"
# [13] ""
# [14] "unknown"
# [15] "heroin overdose"
# [16] "heroin overdose ; perhaps sucide??"
# > unique(meta.sample$HEROIN.HISTORY)
# NULL
# > unique(meta.sample$Tissue.Source)
# [1] "Hurd" "Columbia"
# > unique(meta.sample$Tissue.type)
# [1] "powder" "section"
# > unique(meta.sample$pH)
# [1] 6.840000 NA 6.850000 6.920000 6.590000 6.843333 6.790000 6.150000
# [9] 6.330000 6.820000 6.450000 6.400000 6.810000 6.980000 6.750000 6.520000
# [17] 6.870000 6.700000 6.500000 6.480000 6.570000 6.120000 6.200000 6.660000
# [25] 6.640000 6.470000 6.710000 6.610000 6.360000 6.550000 6.310000 6.680000
# [33] 6.420000 6.100000 6.280000 6.380000 6.230000
### try running with each cell type individually...
# Repeat the variance-explained calculation separately within GABA, GLU and OLIG samples.
# samples in rows, peaks in columns
counts.mat.t <- t(counts.mat)
# Prefix each sample's counts with metadata columns 1:2 so rows can be filtered
# by Celltype. NOTE(review): this pairs metadata rows with count columns by
# position -- confirm meta.sample is in the same order as the count columns.
counts.celltype <- cbind(meta.sample[,1:2], counts.mat.t)
counts.gaba <- subset(counts.celltype, counts.celltype$Celltype == "GABA")
counts.glu <- subset(counts.celltype, counts.celltype$Celltype == "GLU")
counts.olig <- subset(counts.celltype, counts.celltype$Celltype == "OLIG")
# Drop the two metadata columns and transpose back to peaks x samples.
counts.gaba.mat <- t(counts.gaba[,3:ncol(counts.gaba)])
counts.glu.mat <- t(counts.glu[,3:ncol(counts.glu)])
counts.olig.mat <- t(counts.olig[,3:ncol(counts.olig)])
# Matching metadata subsets (same filter, so rows stay in the same relative order).
meta.gaba <- subset(meta.sample, meta.sample$Celltype == "GABA")
meta.glu <- subset(meta.sample, meta.sample$Celltype == "GLU")
meta.olig <- subset(meta.sample, meta.sample$Celltype == "OLIG")
var.gaba <- getVarianceExplained(counts.gaba.mat, meta.gaba)
var.glu <- getVarianceExplained(counts.glu.mat, meta.glu)
var.olig <- getVarianceExplained(counts.olig.mat, meta.olig)
# Mean across peaks, per metadata variable.
colMeans(var.gaba)
# Sample.ID Celltype Batch GROUP SEX
# NaN NA 4.457063 32.384960 10.367401
# AGE CAUSE.OF.DEATH Heroin.History Tissue.Source Tissue.type
# 11.657268 46.808184 NaN 6.938715 1.028696
# pH
# 2.518016
colMeans(var.glu)
# Sample.ID Celltype Batch GROUP SEX
# NaN NA 6.3432710 31.1344035 8.4974713
# AGE CAUSE.OF.DEATH Heroin.History Tissue.Source Tissue.type
# 12.1119330 46.9106555 60.3702467 5.8803967 0.8926939
# pH
# 1.8916593
colMeans(var.olig)
# Sample.ID Celltype Batch GROUP SEX
# NaN NA 3.180945 28.597720 9.694432
# AGE CAUSE.OF.DEATH Heroin.History Tissue.Source Tissue.type
# 10.610164 44.663783 62.773498 5.976913 1.580521
# pH
# 2.079588
### try running with each group individually...
# Repeat the variance-explained calculation separately within control and heroin samples.
counts.mat.t <- t(counts.mat)
# Prefix each sample's counts with metadata columns 1 and 4 so rows can be
# filtered by GROUP. NOTE(review): pairs metadata rows with count columns by
# position -- confirm meta.sample is in the same order as the count columns.
counts.group <- cbind(meta.sample[,c(1,4)], counts.mat.t)
# Controls are labelled with two spellings in the metadata, so both are accepted.
counts.control <- subset(counts.group, counts.group$GROUP == "CONTROL" | counts.group$GROUP == "Control")
counts.heroin <- subset(counts.group, counts.group$GROUP == "HEROIN")
# Drop the two metadata columns and transpose back to peaks x samples.
counts.control.mat <- t(counts.control[,3:ncol(counts.control)])
counts.heroin.mat <- t(counts.heroin[,3:ncol(counts.heroin)])
meta.control <- subset(meta.sample, meta.sample$GROUP == "CONTROL" | meta.sample$GROUP == "Control")
meta.heroin <- subset(meta.sample, meta.sample$GROUP == "HEROIN")
var.control <- getVarianceExplained(counts.control.mat, meta.control)
var.heroin <- getVarianceExplained(counts.heroin.mat, meta.heroin)
# Mean across peaks, per metadata variable.
colMeans(var.control)
# Sample.ID Celltype Batch GROUP SEX
# NaN 1.995912 10.042343 19.408458 21.407012
# AGE CAUSE.OF.DEATH Heroin.History Tissue.Source Tissue.type
# 4.025913 36.403666 NA 19.408458 2.260509
# pH
# 1.473673
colMeans(var.heroin)
# Sample.ID Celltype Batch GROUP SEX
# NaN 3.7789546 1.4331098 NA 2.3169467
# AGE CAUSE.OF.DEATH Heroin.History Tissue.Source Tissue.type
# 3.0815670 1.8836725 26.2908165 NA 0.9464765
# pH
# 2.6831792
https://academic.oup.com/nar/article/47/16/e91/5519166
–> Need to work on generating multiple consensus peak sets with varying criteria… what metadata is incorporated (cell type and condition), how much overlap (0.66), how many samples must include the peak (0.33)
run the diffbind consensus peak calling with MACS2 instead of HMMRATAC peaks (DONE)
use a 0.33 threshold instead of 0.66 (DONE)
call differential peaks using DESeq and limma instead of within DiffBind
gold standard for enhancers –> H3K27ac (27 acetyl), ENCODE list
test stepwise for data input and thresholds for consensus peaks and differential peaks
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Shell setup for the R session below: load the python module, source the
# project's conda loader script, then activate the analysis environment.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# sed 's/_peaks.gappedPeak.col.bed/_summits.bed.out/g' key.files/diffbind.hmmratac.sample.all.outlier2.csv | sed 's/peaks.coord/peaks.summit/g' > key.files/diffbind.hmmratac.summit.sample.outlier.csv
# Build a DiffBind object from the HMMRATAC summit sample sheet and add
# consensus peaksets per Tissue x Condition combination.
library(DiffBind)
library(tidyverse)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# The sheet is read separately only to inspect its column names.
samples <- read.csv("key.files/diffbind.hmmratac.summit.sample.outlier.csv")
names(samples)
df <- dba(sampleSheet="key.files/diffbind.hmmratac.summit.sample.outlier.csv")
head(df$peaks[[1]])
# Dump the first peakset's table to disk.
write.table(df$peaks[[1]], "diffbind.peaks.txt", quote=F, row.names=F, sep="\t")
# NOTE: despite the variable name, no dba.count() call happens here; this
# only appends consensus peaksets (minOverlap given as a fraction, 0.66).
df.counted = dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.66)
df.counted
# 154 Samples, 164756 sites in matrix (264282 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 1 olig control control 1 29032
# 2 2 olig control control 1 26109
# 3 3 olig control control 1 21283
# 4 4 olig control control 1 17434
# 5 5 olig control control 1 26008
# 6 6 olig control control 1 26520
# 7 7 olig control control 1 17306
# 8 8 olig control control 1 6443
# 9 9 olig control control 1 8157
# 10 10 olig control control 1 6417
# 11 11 olig control control 1 9147
# 12 12 olig control control 1 13954
# 13 13 olig control control 1 18657
# 14 14 olig control control 1 18421
# 15 15 olig control control 1 16130
# 16 16 olig control control 1 13823
# 17 17 olig control control 1 23046
# 18 18 olig control control 1 14478
# 19 19 olig control control 1 15281
# 20 20 olig control control 1 43433
# 21 21 olig control control 1 24317
# 22 22 olig control control 1 21075
# 23 23 olig control control 1 18027
# 24 24 olig control control 1 29685
# 25 25 olig heroin heroin 1 22145
# 26 26 olig heroin heroin 1 17825
# 27 27 olig heroin heroin 1 11362
# 28 28 olig heroin heroin 1 16823
# 29 29 olig heroin heroin 1 11474
# 30 30 olig heroin heroin 1 18099
# 31 31 olig heroin heroin 1 22819
# 32 32 olig heroin heroin 1 21158
# 33 33 olig heroin heroin 1 23807
# 34 34 olig heroin heroin 1 21414
# 35 35 olig heroin heroin 1 27465
# 36 36 olig heroin heroin 1 32288
# 37 37 olig heroin heroin 1 18006
# 38 38 olig heroin heroin 1 7793
# 39 39 olig heroin heroin 1 27147
# 40 40 olig heroin heroin 1 16194
# 41 41 olig heroin heroin 1 24684
# 42 42 olig heroin heroin 1 26964
# 43 43 olig heroin heroin 1 29342
# 44 44 olig heroin heroin 1 18771
# 45 45 olig heroin heroin 1 18824
# 46 46 olig heroin heroin 1 31472
# 47 47 olig heroin heroin 1 20327
# 48 48 gaba control control 1 24545
# 49 49 gaba control control 1 32246
# 50 50 gaba control control 1 23614
# 51 51 gaba control control 1 19457
# 52 52 gaba control control 1 42701
# 53 53 gaba control control 1 21587
# 54 54 gaba control control 1 24086
# 55 55 gaba control control 1 9076
# 56 56 gaba control control 1 5998
# 57 57 gaba control control 1 5423
# 58 58 gaba control control 1 34138
# 59 59 gaba control control 1 37469
# 60 60 gaba control control 1 33974
# 61 61 gaba control control 1 16549
# 62 62 gaba control control 1 22743
# 63 63 gaba control control 1 48294
# 64 64 gaba control control 1 23760
# 65 65 gaba control control 1 28689
# 66 66 gaba control control 1 16268
# 67 67 gaba control control 1 24659
# 68 68 gaba control control 1 21780
# 69 69 gaba control control 1 19174
# 70 70 gaba control control 1 14876
# 71 71 gaba control control 1 14740
# 72 72 gaba control control 1 33002
# 73 73 gaba control control 1 30544
# 74 74 gaba heroin heroin 1 35624
# 75 75 gaba heroin heroin 1 32727
# 76 76 gaba heroin heroin 1 6808
# 77 77 gaba heroin heroin 1 8554
# 78 78 gaba heroin heroin 1 22055
# 79 79 gaba heroin heroin 1 19964
# 80 80 gaba heroin heroin 1 36778
# 81 81 gaba heroin heroin 1 32745
# 82 82 gaba heroin heroin 1 17300
# 83 83 gaba heroin heroin 1 41946
# 84 84 gaba heroin heroin 1 13213
# 85 85 gaba heroin heroin 1 3831
# 86 86 gaba heroin heroin 1 11000
# 87 87 gaba heroin heroin 1 15277
# 88 88 gaba heroin heroin 1 21535
# 89 89 gaba heroin heroin 1 24438
# 90 90 gaba heroin heroin 1 24630
# 91 91 gaba heroin heroin 1 35503
# 92 92 gaba heroin heroin 1 9818
# 93 93 gaba heroin heroin 1 10632
# 94 94 gaba heroin heroin 1 11842
# 95 95 gaba heroin heroin 1 28456
# 96 96 gaba heroin heroin 1 8986
# 97 97 gaba heroin heroin 1 22478
# 98 98 glu control control 1 47068
# 99 99 glu control control 1 53067
# 100 100 glu control control 1 74486
# 101 101 glu control control 1 34854
# 102 102 glu control control 1 37625
# 103 103 glu control control 1 64025
# 104 104 glu control control 1 28061
# 105 105 glu control control 1 10506
# 106 106 glu control control 1 32681
# 107 107 glu control control 1 36195
# 108 108 glu control control 1 47090
# 109 109 glu control control 1 60838
# 110 110 glu control control 1 22602
# 111 111 glu control control 1 50575
# 112 112 glu control control 1 47815
# 113 113 glu control control 1 35105
# 114 114 glu control control 1 32028
# 115 115 glu control control 1 30466
# 116 116 glu control control 1 23867
# 117 117 glu control control 1 29987
# 118 118 glu control control 1 33924
# 119 119 glu control control 1 7979
# 120 120 glu control control 1 36745
# 121 121 glu control control 1 136
# 122 122 glu control control 1 46002
# 123 123 glu control control 1 40632
# 124 124 glu heroin heroin 1 41546
# 125 125 glu heroin heroin 1 56011
# 126 126 glu heroin heroin 1 16980
# 127 127 glu heroin heroin 1 42687
# 128 128 glu heroin heroin 1 50208
# 129 129 glu heroin heroin 1 36072
# 130 130 glu heroin heroin 1 54434
# 131 131 glu heroin heroin 1 56322
# 132 132 glu heroin heroin 1 56311
# 133 133 glu heroin heroin 1 65811
# 134 134 glu heroin heroin 1 38417
# 135 135 glu heroin heroin 1 58009
# 136 136 glu heroin heroin 1 24746
# 137 137 glu heroin heroin 1 16008
# 138 138 glu heroin heroin 1 13214
# 139 139 glu heroin heroin 1 24305
# 140 140 glu heroin heroin 1 34243
# 141 141 glu heroin heroin 1 26657
# 142 142 glu heroin heroin 1 42541
# 143 143 glu heroin heroin 1 17302
# 144 144 glu heroin heroin 1 12472
# 145 145 glu heroin heroin 1 15040
# 146 146 glu heroin heroin 1 17516
# 147 147 glu heroin heroin 1 29186
# 148 148 glu heroin heroin 1 32651
# 149 olig:control olig control control 1 11772
# 150 olig:heroin olig heroin heroin 1 13148
# 151 gaba:control gaba control control 1 14197
# 152 gaba:heroin gaba heroin heroin 1 10687
# 153 glu:control glu control control 1 19460
# 154 glu:heroin glu heroin heroin 1 16278
# Export each Tissue x Condition consensus peakset to its own table.
# Peaksets are located by consensus ID instead of by hard-coded list position:
# positions 149-154 are only the consensus sets while the sample sheet has
# exactly 148 rows, so adding or dropping a sample would silently export the
# wrong peakset. Trailing numbers are the interval counts seen in this run.
consensus.out <- c(
  "glu:heroin"   = "diffbind.consensus.glu.heroin.txt",   # 16278
  "glu:control"  = "diffbind.consensus.glu.control.txt",  # 19460
  "gaba:heroin"  = "diffbind.consensus.gaba.heroin.txt",  # 10687
  "gaba:control" = "diffbind.consensus.gaba.control.txt", # 14197
  "olig:heroin"  = "diffbind.consensus.olig.heroin.txt",  # 13148
  "olig:control" = "diffbind.consensus.olig.control.txt"  # 11772
)
peakset.ids <- as.character(dba.show(df.counted)$ID)
for (cons.id in names(consensus.out)) {
  # Restrict to consensus peaksets so a sample that happens to share the ID
  # can never be picked up.
  cons.idx <- which(df.counted$masks$Consensus & peakset.ids == cons.id)
  if (length(cons.idx) != 1) {
    stop("expected exactly one consensus peakset with ID '", cons.id,
         "', found ", length(cons.idx))
  }
  write.table(df.counted$peaks[[cons.idx]], consensus.out[[cons.id]],
              quote=F, row.names=F, sep="\t")
}
# Keep only the consensus peaksets in a separate DBA object.
df_consensus = dba(df.counted, mask = df.counted$masks$Consensus)
# 6 Samples, 20838 sites in matrix (28813 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig:control olig control control 1 11772
# 2 olig:heroin olig heroin heroin 1 13148
# 3 gaba:control gaba control control 1 14197
# 4 gaba:heroin gaba heroin heroin 1 10687
# 5 glu:control glu control control 1 19460
# 6 glu:heroin glu heroin heroin 1 16278
# Consensus peaksets grouped by condition only, then by cell type (Tissue)
# only, each kept in its own consensus-only DBA object.
df.condition = dba.peakset(df, consensus = c(DBA_CONDITION), minOverlap=0.66)
df.condition.consensus <- dba(df.condition, mask = df.condition$masks$Consensus)
# 2 Samples, 8694 sites in matrix (10403 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 control olig-gaba-glu control control 1 9491
# 2 heroin olig-gaba-glu heroin heroin 1 9606
df.cell = dba.peakset(df, consensus = c(DBA_TISSUE), minOverlap=0.66)
df.cell.consensus <- dba(df.cell, mask = df.cell$masks$Consensus)
# 3 Samples, 12026 sites in matrix (24513 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig olig control-heroin control-heroin 1 12168
# 2 gaba gaba control-heroin control-heroin 1 12830
# 3 glu glu control-heroin control-heroin 1 18400
# Venn diagrams of the consensus peaksets, one PDF per grouping.
pdf("consensus.condition.overlap.pdf")
dba.plotVenn(df.condition,df.condition$masks$Consensus)
dev.off()
pdf("consensus.cell.overlap.pdf")
dba.plotVenn(df.cell,df.cell$masks$Consensus)
dev.off()
#### differential consensus peaks
library(tidygenomics)
# Within each cell type, intersect the heroin and control consensus peaksets
# and keep the peaks that appear in only one condition; then look for
# condition-only peaks shared by all three cell types.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac")

# Read one consensus table and number its rows in a column named `id.col`.
load.consensus <- function(path, id.col) {
  peaks <- read.delim(path, header = TRUE, sep = "\t")
  peaks[[id.col]] <- seq.int(nrow(peaks))
  peaks
}

# Rows of `peaks` whose number in `id.col` is absent from `shared`.
not.shared <- function(peaks, shared, id.col) {
  peaks[!(peaks[[id.col]] %in% shared[[id.col]]), ]
}

coord.cols <- c("Chr", "Start", "End")

glu.heroin <- load.consensus("diffbind.consensus.glu.heroin.txt", "glu.heroin.peak")
glu.control <- load.consensus("diffbind.consensus.glu.control.txt", "glu.control.peak")
gaba.heroin <- load.consensus("diffbind.consensus.gaba.heroin.txt", "gaba.heroin.peak")
gaba.control <- load.consensus("diffbind.consensus.gaba.control.txt", "gaba.control.peak")
olig.heroin <- load.consensus("diffbind.consensus.olig.heroin.txt", "olig.heroin.peak")
olig.control <- load.consensus("diffbind.consensus.olig.control.txt", "olig.control.peak")

glu <- genome_intersect(glu.heroin, glu.control, by = coord.cols)
glu.heroin.only <- not.shared(glu.heroin, glu, "glu.heroin.peak")
glu.control.only <- not.shared(glu.control, glu, "glu.control.peak")
# glu intersect = 14636, heroin only = 1642, control only = 4824

gaba <- genome_intersect(gaba.heroin, gaba.control, by = coord.cols)
gaba.heroin.only <- not.shared(gaba.heroin, gaba, "gaba.heroin.peak")
gaba.control.only <- not.shared(gaba.control, gaba, "gaba.control.peak")
# gaba intersect = 10144, heroin only = 543, control only = 4053

olig <- genome_intersect(olig.heroin, olig.control, by = coord.cols)
olig.heroin.only <- not.shared(olig.heroin, olig, "olig.heroin.peak")
olig.control.only <- not.shared(olig.control, olig, "olig.control.peak")
# olig intersect = 10762, heroin only = 2386, control only = 1010

# Heroin-only peaks overlapping across glu, gaba and olig.
glu.gaba.heroin <- genome_intersect(glu.heroin.only, gaba.heroin.only, by = coord.cols)
glu.gaba.olig.heroin <- genome_intersect(glu.gaba.heroin, olig.heroin.only, by = coord.cols)
# 9

# Control-only peaks overlapping across glu, gaba and olig.
glu.gaba.control <- genome_intersect(glu.control.only, gaba.control.only, by = coord.cols)
glu.gaba.olig.control <- genome_intersect(glu.gaba.control, olig.control.only, by = coord.cols)
# 40
### lower threshold
# Same Tissue x Condition consensus derivation at the lower 0.33 threshold.
df.counted = dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.33)
df_consensus = dba(df.counted, mask = df.counted$masks$Consensus)
df_consensus
# 6 Samples, 48100 sites in matrix (61505 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig:control olig control control 1 21953
# 2 olig:heroin olig heroin heroin 1 23865
# 3 gaba:control gaba control control 1 27288
# 4 gaba:heroin gaba heroin heroin 1 24169
# 5 glu:control glu control control 1 42188
# 6 glu:heroin glu heroin heroin 1 37670
# Export each consensus peakset. Peaksets are located by consensus ID instead
# of by hard-coded list position (149-154), which is only valid while the
# sample sheet has exactly 148 rows and would otherwise silently export the
# wrong peakset.
consensus.out <- c(
  "glu:heroin"   = "diffbind.consensus.0.3.glu.heroin.txt",
  "glu:control"  = "diffbind.consensus.0.3.glu.control.txt",
  "gaba:heroin"  = "diffbind.consensus.0.3.gaba.heroin.txt",
  "gaba:control" = "diffbind.consensus.0.3.gaba.control.txt",
  "olig:heroin"  = "diffbind.consensus.0.3.olig.heroin.txt",
  "olig:control" = "diffbind.consensus.0.3.olig.control.txt"
)
peakset.ids <- as.character(dba.show(df.counted)$ID)
for (cons.id in names(consensus.out)) {
  # Restrict to consensus peaksets so a sample sharing the ID is never used.
  cons.idx <- which(df.counted$masks$Consensus & peakset.ids == cons.id)
  if (length(cons.idx) != 1) {
    stop("expected exactly one consensus peakset with ID '", cons.id,
         "', found ", length(cons.idx))
  }
  write.table(df.counted$peaks[[cons.idx]], consensus.out[[cons.id]],
              quote=F, row.names=F, sep="\t")
}
# salloc -A SYB105 -p gpu -N 1 -t 2:00:00
# Shell setup for the R session below: source the project's conda loader
# script, then activate the analysis environment.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# sed 's/_summits.bed.out/.macs2.bed.gz/g' diffbind.hmmratac.summit.sample.outlier.csv | sed 's/hmmratac/macs.output/g' | sed 's/peaks.summit/peaks/g' > diffbind.macs2.outlier.sample.csv
# Same consensus workflow as for HMMRATAC, now driven by the MACS2 sample
# sheet.
library(DiffBind)
library(tidyverse)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# The sheet is read separately only to inspect its column names.
samples <- read.csv("key.files/diffbind.macs2.outlier.sample.csv")
names(samples)
df <- dba(sampleSheet="key.files/diffbind.macs2.outlier.sample.csv")
head(df$peaks[[1]])
# Chr Start End Score
# 1 NC_000001.11 9983 10574 0.027730460
# 2 NC_000001.11 15988 16290 0.001426138
# 3 NC_000001.11 180689 181134 0.012478707
# 4 NC_000001.11 181304 181624 0.004654756
# 5 NC_000001.11 191202 191880 0.193281306
# 6 NC_000001.11 586084 586272 0.003664382
# Dump the first peakset's table to disk.
write.table(df$peaks[[1]], "diffbind.MACS2.peaks.txt", quote=F, row.names=F, sep="\t")
# NOTE: despite the variable name, no dba.count() call happens here; this
# only appends consensus peaksets (minOverlap given as a fraction, 0.66).
df.counted = dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.66)
df.counted
# 154 Samples, 1166714 sites in matrix (1976997 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 1 olig control control 1 127091
# 2 2 olig control control 1 215173
# 3 3 olig control control 1 70257
# 4 4 olig control control 1 170327
# 5 5 olig control control 1 154052
# 6 6 olig control control 1 135875
# 7 7 olig control control 1 82598
# 8 8 olig control control 1 83272
# 9 9 olig control control 1 230958
# 10 10 olig control control 1 151728
# 11 11 olig control control 1 49618
# 12 12 olig control control 1 93979
# 13 13 olig control control 1 96682
# 14 14 olig control control 1 103098
# 15 15 olig control control 1 93969
# 16 16 olig control control 1 93258
# 17 17 olig control control 1 67336
# 18 18 olig control control 1 74100
# 19 19 olig control control 1 89286
# 20 20 olig control control 1 147328
# 21 21 olig control control 1 130113
# 22 22 olig control control 1 89226
# 23 23 olig control control 1 105495
# 24 24 olig control control 1 88295
# 25 25 olig heroin heroin 1 120009
# 26 26 olig heroin heroin 1 211692
# 27 27 olig heroin heroin 1 187822
# 28 28 olig heroin heroin 1 147315
# 29 29 olig heroin heroin 1 92846
# 30 30 olig heroin heroin 1 165164
# 31 31 olig heroin heroin 1 134661
# 32 32 olig heroin heroin 1 87730
# 33 33 olig heroin heroin 1 131573
# 34 34 olig heroin heroin 1 139266
# 35 35 olig heroin heroin 1 261439
# 36 36 olig heroin heroin 1 120885
# 37 37 olig heroin heroin 1 90574
# 38 38 olig heroin heroin 1 139822
# 39 39 olig heroin heroin 1 111912
# 40 40 olig heroin heroin 1 104718
# 41 41 olig heroin heroin 1 99995
# 42 42 olig heroin heroin 1 104045
# 43 43 olig heroin heroin 1 112723
# 44 44 olig heroin heroin 1 122072
# 45 45 olig heroin heroin 1 126334
# 46 46 olig heroin heroin 1 123258
# 47 47 olig heroin heroin 1 133928
# 48 48 gaba control control 1 198303
# 49 49 gaba control control 1 170557
# 50 50 gaba control control 1 241774
# 51 51 gaba control control 1 142237
# 52 52 gaba control control 1 164678
# 53 53 gaba control control 1 206210
# 54 54 gaba control control 1 153739
# 55 55 gaba control control 1 226224
# 56 56 gaba control control 1 180995
# 57 57 gaba control control 1 70535
# 58 58 gaba control control 1 127841
# 59 59 gaba control control 1 170867
# 60 60 gaba control control 1 110087
# 61 61 gaba control control 1 97007
# 62 62 gaba control control 1 114838
# 63 63 gaba control control 1 129186
# 64 64 gaba control control 1 114713
# 65 65 gaba control control 1 97480
# 66 66 gaba control control 1 104188
# 67 67 gaba control control 1 116698
# 68 68 gaba control control 1 128795
# 69 69 gaba control control 1 86500
# 70 70 gaba control control 1 88078
# 71 71 gaba control control 1 85230
# 72 72 gaba control control 1 125732
# 73 73 gaba control control 1 113273
# 74 74 gaba heroin heroin 1 134045
# 75 75 gaba heroin heroin 1 168521
# 76 76 gaba heroin heroin 1 166341
# 77 77 gaba heroin heroin 1 179418
# 78 78 gaba heroin heroin 1 199056
# 79 79 gaba heroin heroin 1 151027
# 80 80 gaba heroin heroin 1 148022
# 81 81 gaba heroin heroin 1 156470
# 82 82 gaba heroin heroin 1 96019
# 83 83 gaba heroin heroin 1 159910
# 84 84 gaba heroin heroin 1 100005
# 85 85 gaba heroin heroin 1 49005
# 86 86 gaba heroin heroin 1 98977
# 87 87 gaba heroin heroin 1 102668
# 88 88 gaba heroin heroin 1 104025
# 89 89 gaba heroin heroin 1 112441
# 90 90 gaba heroin heroin 1 110148
# 91 91 gaba heroin heroin 1 129107
# 92 92 gaba heroin heroin 1 74061
# 93 93 gaba heroin heroin 1 99082
# 94 94 gaba heroin heroin 1 89095
# 95 95 gaba heroin heroin 1 107037
# 96 96 gaba heroin heroin 1 67641
# 97 97 gaba heroin heroin 1 136514
# 98 98 glu control control 1 220004
# 99 99 glu control control 1 191250
# 100 100 glu control control 1 184671
# 101 101 glu control control 1 170434
# 102 102 glu control control 1 152497
# 103 103 glu control control 1 201937
# 104 104 glu control control 1 180494
# 105 105 glu control control 1 174604
# 106 106 glu control control 1 198247
# 107 107 glu control control 1 156688
# 108 108 glu control control 1 166867
# 109 109 glu control control 1 181517
# 110 110 glu control control 1 137010
# 111 111 glu control control 1 165303
# 112 112 glu control control 1 152910
# 113 113 glu control control 1 164281
# 114 114 glu control control 1 134114
# 115 115 glu control control 1 146795
# 116 116 glu control control 1 133616
# 117 117 glu control control 1 137460
# 118 118 glu control control 1 152377
# 119 119 glu control control 1 82867
# 120 120 glu control control 1 147951
# 121 121 glu control control 1 126069
# 122 122 glu control control 1 166491
# 123 123 glu control control 1 145976
# 124 124 glu heroin heroin 1 154360
# 125 125 glu heroin heroin 1 182313
# 126 126 glu heroin heroin 1 102994
# 127 127 glu heroin heroin 1 173161
# 128 128 glu heroin heroin 1 179822
# 129 129 glu heroin heroin 1 167924
# 130 130 glu heroin heroin 1 187327
# 131 131 glu heroin heroin 1 197371
# 132 132 glu heroin heroin 1 186815
# 133 133 glu heroin heroin 1 178205
# 134 134 glu heroin heroin 1 171950
# 135 135 glu heroin heroin 1 177893
# 136 136 glu heroin heroin 1 107781
# 137 137 glu heroin heroin 1 135992
# 138 138 glu heroin heroin 1 121637
# 139 139 glu heroin heroin 1 154201
# 140 140 glu heroin heroin 1 147983
# 141 141 glu heroin heroin 1 147052
# 142 142 glu heroin heroin 1 168592
# 143 143 glu heroin heroin 1 104150
# 144 144 glu heroin heroin 1 108562
# 145 145 glu heroin heroin 1 108496
# 146 146 glu heroin heroin 1 124786
# 147 147 glu heroin heroin 1 151751
# 148 148 glu heroin heroin 1 142242
# 149 olig:control olig control control 1 38060
# 150 olig:heroin olig heroin heroin 1 46054
# 151 gaba:control gaba control control 1 51337
# 152 gaba:heroin gaba heroin heroin 1 47329
# 153 glu:control glu control control 1 83915
# 154 glu:heroin glu heroin heroin 1 78655
# Export each Tissue x Condition MACS2 consensus peakset to its own table.
# Peaksets are located by consensus ID instead of by hard-coded list position:
# positions 149-154 are only the consensus sets while the sample sheet has
# exactly 148 rows, so adding or dropping a sample would silently export the
# wrong peakset.
consensus.out <- c(
  "glu:heroin"   = "diffbind.MACS2.consensus.glu.heroin.txt",
  "glu:control"  = "diffbind.MACS2.consensus.glu.control.txt",
  "gaba:heroin"  = "diffbind.MACS2.consensus.gaba.heroin.txt",
  "gaba:control" = "diffbind.MACS2.consensus.gaba.control.txt",
  "olig:heroin"  = "diffbind.MACS2.consensus.olig.heroin.txt",
  "olig:control" = "diffbind.MACS2.consensus.olig.control.txt"
)
peakset.ids <- as.character(dba.show(df.counted)$ID)
for (cons.id in names(consensus.out)) {
  # Restrict to consensus peaksets so a sample sharing the ID is never used.
  cons.idx <- which(df.counted$masks$Consensus & peakset.ids == cons.id)
  if (length(cons.idx) != 1) {
    stop("expected exactly one consensus peakset with ID '", cons.id,
         "', found ", length(cons.idx))
  }
  write.table(df.counted$peaks[[cons.idx]], consensus.out[[cons.id]],
              quote=F, row.names=F, sep="\t")
}
# Keep only the consensus peaksets in a separate DBA object.
df_consensus = dba(df.counted, mask = df.counted$masks$Consensus)
# 6 Samples, 94386 sites in matrix (117381 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig:control olig control control 1 38060
# 2 olig:heroin olig heroin heroin 1 46054
# 3 gaba:control gaba control control 1 51337
# 4 gaba:heroin gaba heroin heroin 1 47329
# 5 glu:control glu control control 1 83915
# 6 glu:heroin glu heroin heroin 1 78655
# MACS2 consensus peaksets grouped by condition only, then by cell type
# (Tissue) only, each kept in its own consensus-only DBA object.
df.condition = dba.peakset(df, consensus = c(DBA_CONDITION), minOverlap=0.66)
df.condition.consensus <- dba(df.condition, mask = df.condition$masks$Consensus)
# 2 Samples, 31871 sites in matrix (37453 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 control olig-gaba-glu control control 1 34208
# 2 heroin olig-gaba-glu heroin heroin 1 35410
df.cell = dba.peakset(df, consensus = c(DBA_TISSUE), minOverlap=0.66)
df.cell.consensus <- dba(df.cell, mask = df.cell$masks$Consensus)
# 3 Samples, 43541 sites in matrix (106019 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig olig control-heroin control-heroin 1 40610
# 2 gaba gaba control-heroin control-heroin 1 50095
# 3 glu glu control-heroin control-heroin 1 82244
# Venn diagrams of the consensus peaksets, one PDF per grouping.
pdf("consensus.MACS2.condition.overlap.pdf")
dba.plotVenn(df.condition,df.condition$masks$Consensus)
dev.off()
pdf("consensus.MACS2.cell.overlap.pdf")
dba.plotVenn(df.cell,df.cell$masks$Consensus)
dev.off()
## use 33% overlap instead of 66%
# Repeat all three consensus groupings with the lower overlap fraction.
overlap.frac <- 0.33

# Cell type x condition.
df.counted <- dba.peakset(df, consensus = c(DBA_TISSUE, DBA_CONDITION),
                          minOverlap = overlap.frac)
df.counted
df_consensus <- dba(df.counted, mask = df.counted$masks$Consensus)
df_consensus
# 6 Samples, 169765 sites in matrix (212620 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig:control olig control control 1 75160
# 2 olig:heroin olig heroin heroin 1 87428
# 3 gaba:control gaba control control 1 95067
# 4 gaba:heroin gaba heroin heroin 1 89202
# 5 glu:control glu control control 1 136046
# 6 glu:heroin glu heroin heroin 1 132407

# Condition only.
df.condition <- dba.peakset(df, consensus = DBA_CONDITION,
                            minOverlap = overlap.frac)
df.condition.consensus <- dba(df.condition, mask = df.condition$masks$Consensus)
df.condition.consensus
# 2 Samples, 84906 sites in matrix (106578 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 control olig-gaba-glu control control 1 97696
# 2 heroin olig-gaba-glu heroin heroin 1 95423

# Cell type only.
df.cell <- dba.peakset(df, consensus = DBA_TISSUE,
                       minOverlap = overlap.frac)
df.cell.consensus <- dba(df.cell, mask = df.cell$masks$Consensus)
df.cell.consensus
# 3 Samples, 75276 sites in matrix (187878 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig olig control-heroin control-heroin 1 78192
# 2 gaba gaba control-heroin control-heroin 1 88827
# 3 glu glu control-heroin control-heroin 1 134767
#### differential consensus peaks
library(tidygenomics)
# Within each cell type, intersect the heroin and control MACS2 consensus
# peaksets and keep the peaks that appear in only one condition; then look
# for condition-only peaks shared by all three cell types.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac")

# Read one consensus table and number its rows in a column named `id.col`.
load.consensus <- function(path, id.col) {
  peaks <- read.delim(path, header = TRUE, sep = "\t")
  peaks[[id.col]] <- seq.int(nrow(peaks))
  peaks
}

# Rows of `peaks` whose number in `id.col` is absent from `shared`.
not.shared <- function(peaks, shared, id.col) {
  peaks[!(peaks[[id.col]] %in% shared[[id.col]]), ]
}

coord.cols <- c("Chr", "Start", "End")

glu.heroin <- load.consensus("diffbind.MACS2.consensus.glu.heroin.txt", "glu.heroin.peak")
glu.control <- load.consensus("diffbind.MACS2.consensus.glu.control.txt", "glu.control.peak")
gaba.heroin <- load.consensus("diffbind.MACS2.consensus.gaba.heroin.txt", "gaba.heroin.peak")
gaba.control <- load.consensus("diffbind.MACS2.consensus.gaba.control.txt", "gaba.control.peak")
olig.heroin <- load.consensus("diffbind.MACS2.consensus.olig.heroin.txt", "olig.heroin.peak")
olig.control <- load.consensus("diffbind.MACS2.consensus.olig.control.txt", "olig.control.peak")

glu <- genome_intersect(glu.heroin, glu.control, by = coord.cols)
glu.heroin.only <- not.shared(glu.heroin, glu, "glu.heroin.peak")
glu.control.only <- not.shared(glu.control, glu, "glu.control.peak")
# glu intersect = 73580, heroin only = 5706, control only = 10848

gaba <- genome_intersect(gaba.heroin, gaba.control, by = coord.cols)
gaba.heroin.only <- not.shared(gaba.heroin, gaba, "gaba.heroin.peak")
gaba.control.only <- not.shared(gaba.control, gaba, "gaba.control.peak")
# gaba intersect = 44334, heroin only = 3169, control only = 7319

olig <- genome_intersect(olig.heroin, olig.control, by = coord.cols)
olig.heroin.only <- not.shared(olig.heroin, olig, "olig.heroin.peak")
olig.control.only <- not.shared(olig.control, olig, "olig.control.peak")
# olig intersect = 37224, heroin only = 8987, control only = 1164

# Heroin-only peaks overlapping across glu, gaba and olig.
glu.gaba.heroin <- genome_intersect(glu.heroin.only, gaba.heroin.only, by = coord.cols)
glu.gaba.olig.heroin <- genome_intersect(glu.gaba.heroin, olig.heroin.only, by = coord.cols)
# 15

# Control-only peaks overlapping across glu, gaba and olig.
glu.gaba.control <- genome_intersect(glu.control.only, gaba.control.only, by = coord.cols)
glu.gaba.olig.control <- genome_intersect(glu.gaba.control, olig.control.only, by = coord.cols)
# 7
### Differential - Condition + Tissue
# Differential analysis on the MACS2 DBA object, set up two ways:
#   1. contrasts generated automatically from the Condition and Tissue
#      metadata categories;
#   2. an explicit additive design with both factors.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df.counted <- dba(df)

# --- 1. automatic contrasts over both categories -------------------------
df.counted <- dba.contrast(df.counted, categories=c(DBA_CONDITION,DBA_TISSUE))
# Each call below overwrites df.analysed; only the DBA_ALL_METHODS result
# is still held when dba.show() runs.
#DESeq2
df.analysed <- dba.analyze(df.counted)
#EdgeR
df.analysed <- dba.analyze(df.counted, method=DBA_EDGER)
#Both methods simultaneously
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=T)

# --- 2. additive model with both factors ---------------------------------
# The DBA_* attributes are integer codes, so the previous
# `categories=DBA_CONDITION+DBA_TISSUE` added them into one unrelated code
# instead of selecting both factors. The additive model is stated as a design
# formula instead, starting again from a DBA object without the contrasts
# set up in step 1.
df.counted <- dba.contrast(dba(df), design="~Tissue + Condition")
#DESeq2
df.analysed <- dba.analyze(df.counted)
#EdgeR
df.analysed <- dba.analyze(df.counted, method=DBA_EDGER)
#Both methods simultaneously
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=T)
# ####### trying to make smaller consensus peaks (currently mean ~ 900bp)
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/peaks
# gunzip *.gz
# for i in *.bed; do
# awk '{ sum += ($3-$2); n++ } END { if (n > 0) print sum / n; }' $i
# done
# # all input peak files have average peak size ~250-400bp
# ## so why are the consensus peak sets so large?
#
#
#
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# module load python
# source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
#
# R
#
# library(DiffBind)
# library(tidyverse)
#
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
#
# samples <- read.csv("key.files/diffbind.macs2.outlier.sample.csv")
# names(samples)
#
# df <- dba(sampleSheet="key.files/diffbind.macs2.outlier.sample.csv")
#
# df.counted = dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.66)
# dba(df.counted, mask = df.counted$masks$Consensus)
# # ID Tissue Condition Treatment Replicate Intervals
# # 1 olig:control olig control control 1 38060
# # 2 olig:heroin olig heroin heroin 1 46054
# # 3 gaba:control gaba control control 1 51337
# # 4 gaba:heroin gaba heroin heroin 1 47329
# # 5 glu:control glu control control 1 83915
# # 6 glu:heroin glu heroin heroin 1 78655
# mean(df.counted$peaks[[154]]$End - df.counted$peaks[[154]]$Start)
# # 905.3869
# df.counted = dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=1.00)
# dba(df.counted, mask = df.counted$masks$Consensus)
# # ID Tissue Condition Treatment Replicate Intervals
# # 1 olig:control olig control control 1 881015
# # 2 olig:heroin olig heroin heroin 1 950013
# # 3 gaba:control gaba control control 1 960194
# # 4 gaba:heroin gaba heroin heroin 1 861273
# # 5 glu:control glu control control 1 866982
# # 6 glu:heroin glu heroin heroin 1 747306
# mean(df.counted$peaks[[154]]$End - df.counted$peaks[[154]]$Start)
# # 339.6303
#
# # write.table(df.counted$peaks[[154]], "diffbind.MACS2.consensus.glu.heroin.txt", quote=F, row.names=F, sep="\t")
# # write.table(df.counted$peaks[[153]], "diffbind.MACS2.consensus.glu.control.txt", quote=F, row.names=F, sep="\t")
# # write.table(df.counted$peaks[[152]], "diffbind.MACS2.consensus.gaba.heroin.txt", quote=F, row.names=F, sep="\t")
# # write.table(df.counted$peaks[[151]], "diffbind.MACS2.consensus.gaba.control.txt", quote=F, row.names=F, sep="\t")
# # write.table(df.counted$peaks[[150]], "diffbind.MACS2.consensus.olig.heroin.txt", quote=F, row.names=F, sep="\t")
# # write.table(df.counted$peaks[[149]], "diffbind.MACS2.consensus.olig.control.txt", quote=F, row.names=F, sep="\t")
#
# df.counted = dba.peakset(df, minOverlap=1.00)
#!/bin/bash
# Slurm batch job: activate the conda environment and run
# macs2.diffbind.differential.R non-interactively from the project directory.
# NOTE(review): -N 2 requests two nodes but a single R process is launched
# below -- confirm the second node is actually used.
#SBATCH -A SYB105
#SBATCH -J diffbind
#SBATCH -N 2
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
# R CMD BATCH runs the script without an interactive session.
R CMD BATCH macs2.diffbind.differential.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/macs2.diffbind.differential.sh
# DiffBind differential analysis on the MACS2 peaks
# (presumably the contents of macs2.diffbind.differential.R run by the batch
# job): count -> normalize -> contrast (~Tissue + Condition) -> analyze ->
# plots and report.
library(DiffBind)
library(tidyverse)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# The sheet is read separately only to inspect its column names.
samples <- read.csv("key.files/diffbind.macs2.outlier.sample.csv")
names(samples)
df <- dba(sampleSheet="key.files/diffbind.macs2.outlier.sample.csv")
# Default plot of the DBA object before counting.
pdf("diffbind.macs2.plot.pdf")
plot(df)
dev.off()
df.count <- dba.count(df)
df.count
# Per-sample table: Reads, FRiP, and their product rounded to whole reads.
info <- dba.show(df.count)
libsizes <- cbind(LibReads=info$Reads, FRiP=info$FRiP,PeakReads=round(info$Reads * info$FRiP))
rownames(libsizes) <- info$ID
libsizes
# Default plot again, now on the counted object.
pdf("diffbind.macs2.plot2.pdf")
plot(df.count)
dev.off()
# Normalize, then retrieve the normalization details for inspection.
df.norm <- dba.normalize(df.count)
norm <- dba.normalize(df.norm, bRetrieve=TRUE)
norm
# Library size, normalization factor, and library size divided by that factor.
normlibs <- cbind(FullLibSize=norm$lib.sizes, NormFacs=norm$norm.factors, NormLibSize=round(norm$lib.sizes/norm$norm.factors))
rownames(normlibs) <- info$ID
normlibs
# Additive design with Tissue and Condition.
df.contrast <- dba.contrast(df.norm, design="~Tissue + Condition")
df.contrast
# First analysis pass with the default method.
df.analyze <- dba.analyze(df.contrast)
dba.show(df.analyze, bContrasts=TRUE)
pdf("diffbind.macs2.plot3.pdf")
dba.plotPCA(df.analyze,DBA_TISSUE,label=DBA_CONDITION)
dev.off()
pdf("diffbind.macs2.plot4.pdf")
dba.plotMA(df.analyze)
dev.off()
pdf("diffbind.macs2.plot5.pdf")
dba.plotVolcano(df.analyze)
dev.off()
# Report of differential sites and the smallest absolute fold change in it.
multifactor.DB <- dba.report(df.analyze)
min(abs(multifactor.DB$Fold))
multifactor.DB
# Second pass with all methods; this replaces df.analyze from the first pass.
df.analyze <- dba.analyze(df.contrast,method=DBA_ALL_METHODS)
dba.show(df.analyze,bContrasts=TRUE)
# Profiles are computed once, then the returned object is re-plotted to PDF.
# NOTE(review): the computing call runs with no pdf() device open -- confirm
# whether it also draws (in batch mode that would land in a default file).
profiles <- dba.plotProfile(df.analyze)
pdf("diffbind.macs2.plot6.pdf")
dba.plotProfile(profiles)
dev.off()
# Same, with the merge argument set to Tissue and Condition.
profiles <- dba.plotProfile(df.analyze,merge=c(DBA_TISSUE, DBA_CONDITION))
pdf("diffbind.macs2.plot7.pdf")
dba.plotProfile(profiles)
dev.off()
### add in additional co-variants (age, sex)
# Build a DiffBind sample sheet that carries extra covariates, starting from
# the full ATAC metadata table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
df <- read.delim("ATAC.All.Metadata.txt", header=T, sep="\t")
library(tidyr)
# Per-sample BAM and peak file paths are derived from the Sample column.
df$bamReads <- paste0("bwa.output/rmdups.bam/", df$Sample, ".rmdups.bam")
df$Peaks <- paste0("bwa.output/macs.output/peaks/", df$Sample, ".macs2.bed.gz")
df$PeakCaller <- "bed"
# Copy Sample so it can be split on "_"; only the first piece (Sample.ID) is
# selected below, the extra* pieces are not.
df$sample <- df$Sample
df2 <- separate(df, sample, c("Sample.ID", "extra1", "extra2", "extra3", "extra4"), sep="_")
# NOTE(review): columns are picked by position; this presumably selects
# Sample.ID, the covariates, bamReads, Peaks and PeakCaller -- verify against
# names(df2), since any change to the metadata file's columns shifts these.
df3 <- df2[,c(16,3,5,6,7,13,14,15)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
write.table(df3, "diffbind.macs2.outlier.sample.factors.csv", row.names=F, quote=T, sep=",")
#!/bin/bash
# Slurm batch job: activate the conda environment and run
# macs2.diffbind.differential.factors.R non-interactively from the project
# directory.
# NOTE(review): -N 2 requests two nodes but a single R process is launched
# below -- confirm the second node is actually used.
#SBATCH -A SYB105
#SBATCH -J diffbind
#SBATCH -N 2
#SBATCH -t 2:00:00
#SBATCH --mem-per-cpu=0
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
# R CMD BATCH runs the script without an interactive session.
R CMD BATCH macs2.diffbind.differential.factors.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/macs2.diffbind.differential.factors.sh
# DiffBind differential analysis using the sample sheet that carries the extra covariates.
library(DiffBind)
library(tidyverse)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# Read the sheet once only to print its column names (`samples` is not used again below).
samples <- read.csv("key.files/diffbind.macs2.outlier.sample.factors.csv")
names(samples)
# Build the DBA object from the sheet, count reads, and normalize with defaults.
df <- dba(sampleSheet="key.files/diffbind.macs2.outlier.sample.factors.csv")
df.count <- dba.count(df)
df.norm <- dba.normalize(df.count)
# Normalization details are retrieved but not used further in this script.
norm <- dba.normalize(df.norm, bRetrieve=TRUE)
# NOTE(review): the DiffBind manual lists Tissue, Factor, Condition, Treatment, Replicate
# and Caller as the factors allowed in a design formula — confirm that the custom column
# names used here (Celltype, GROUP, SEX, AGE) are accepted, or map them onto those fields.
df.contrast <- dba.contrast(df.norm, design="~Celltype + GROUP + SEX + AGE")
df.contrast
df.analyze <- dba.analyze(df.contrast)
dba.show(df.analyze, bContrasts=TRUE)
# NOTE(review): DBA_SEX, DBA_AGE and DBA_CELLTYPE are not among the attribute constants
# documented by DiffBind and are not defined anywhere in this script — confirm they exist,
# otherwise this call fails with "object not found".
pdf("diffbind.macs2.pca.factors.pdf")
dba.plotPCA(df.analyze, attributes=c(DBA_GROUP,DBA_SEX, DBA_AGE), label=DBA_CELLTYPE)
dev.off()
# Overlap the GLU consensus ATAC peaks with the gene body and the promoter of a GLU
# DE gene (PRSS35), separately for the control and the opioid (heroin) peak sets.
library(tidygenomics)
library(dplyr)
library(tidyr)
# Gene annotation (headerless GTF of gene records).
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Genome")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header=F, sep="\t")
# Consensus peak sets per cell type and condition (headerless BED-like tables).
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks")
glu.atac.control <- read.delim("diffbind.MACS2.consensus.glu.control.bed", header=F, sep="\t")
glu.atac.opioid <- read.delim("diffbind.MACS2.consensus.glu.heroin.bed", header=F, sep="\t")
gaba.atac.control <- read.delim("diffbind.MACS2.consensus.gaba.control.bed", header=F, sep="\t")
gaba.atac.opioid <- read.delim("diffbind.MACS2.consensus.gaba.heroin.bed", header=F, sep="\t")
olig.atac.control <- read.delim("diffbind.MACS2.consensus.olig.control.bed", header=F, sep="\t")
olig.atac.opioid <- read.delim("diffbind.MACS2.consensus.olig.heroin.bed", header=F, sep="\t")
# RNA-seq differential-expression tables per cell type.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
glu <- read.delim("Glu_intersection.csv", header=T, sep=",")
gaba <- read.delim("Gaba_intersection.csv", header=T, sep=",")
olig <- read.delim("Olig_intersection.csv", header=T, sep=",")
# Name the GTF columns and peel the gene identifier (gid) out of the attribute column:
# first split on spaces (key, value), then cut the value at the first ";".
colnames(gene) <- c("chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info")
gene.df <- separate(gene, "info", c("gene", "id"), sep=" ")
gene.id <- separate(gene.df, "id", c("gid"), sep=";")
# Strongly changed GLU genes: |log2FC| > 2 and adjusted p < 0.05.
glu.log2.pval.up <- subset(glu, glu$Significance == "UP" & abs(glu$log2FC) > 2 & glu$Padj < 0.05)
# X Gene Significance log2FC Padj
# 15 15 PRSS35 UP 2.093866 0.003142919
# 76 76 FRMD7 UP 2.700107 0.043355159
glu.log2.pval.down <- subset(glu, glu$Significance == "DOWN" & abs(glu$log2FC) > 2 & glu$Padj < 0.05)
# X Gene Significance log2FC Padj
# 163 163 GNL1_2 DOWN -2.167138 0.02578113
glu.up1 <- subset(gene.id, gene.id$gid == "PRSS35")
glu.up2 <- subset(gene.id, gene.id$gid == "FRMD7")
glu.down <- subset(gene.id, gene.id$gid == "GNL1_2")

## Gene body
# Duplicate the gene coordinates under a second set of names so the control and the
# opioid peak tables can each be matched through their own column names.
glu.up1$chr2 <- glu.up1$chr
glu.up1$start2 <- glu.up1$start
glu.up1$end2 <- glu.up1$end
colnames(glu.atac.control) <- c("chr", "start", "end", "value")
colnames(glu.atac.opioid) <- c("chr2", "start2", "end2", "value")
glu.up1.atac <- genome_intersect(glu.up1, glu.atac.control, by=c("chr", "start", "end"))
# 7 peaks intersect with the gene in the control sample
# Intersect the gene itself with the opioid peaks. Chaining this onto glu.up1.atac (as was
# done before) pairs every control row with every opioid overlap, so the 49 rows seen then
# were a 7 x 7 cross-product, not 49 opioid peaks.
glu.up1.atac.opioid <- genome_intersect(glu.up1, glu.atac.opioid, by=c("chr2", "start2", "end2"))

## Promoter (2 kb upstream of the TSS)
# Strand-aware: upstream of a minus-strand gene lies beyond its `end` coordinate.
glu.up1$chr2 <- glu.up1$chr
glu.up1$prom.start <- ifelse(glu.up1$strand == "-", glu.up1$end, glu.up1$start - 2000)
glu.up1$prom.end <- ifelse(glu.up1$strand == "-", glu.up1$end + 2000, glu.up1$start)
glu.up1$prom.start2 <- glu.up1$prom.start
glu.up1$prom.end2 <- glu.up1$prom.end
colnames(glu.atac.control) <- c("chr", "prom.start", "prom.end", "value")
colnames(glu.atac.opioid) <- c("chr2", "prom.start2", "prom.end2", "value")
glu.up1.atac.promoter <- genome_intersect(glu.up1, glu.atac.control, by=c("chr", "prom.start", "prom.end"))
# As for the gene body: intersect the promoter with the opioid peaks directly rather than
# chaining onto the control result.
glu.up1.atac.promoter.opioid <- genome_intersect(glu.up1, glu.atac.opioid, by=c("chr2", "prom.start2", "prom.end2"))
library(tidygenomics)
library(dplyr)
library(tidyr)
library(ggplot2)
# --- Gene annotation: headerless GTF of gene records -------------------------------------
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Genome")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header = FALSE, sep = "\t")
names(gene) <- c("chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info")
# Split the attribute column on spaces, then cut the value at the first ";" to get gid.
gene.df <- gene %>% separate("info", c("gene", "id"), sep = " ")
gene.id <- gene.df %>% separate("id", c("gid"), sep = ";")

# --- Consensus ATAC peaks per cell type and condition (headerless, tab-separated) --------
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks")
glu.atac.control <- read.delim("diffbind.MACS2.consensus.glu.control.bed", header = FALSE, sep = "\t")
glu.atac.opioid <- read.delim("diffbind.MACS2.consensus.glu.heroin.bed", header = FALSE, sep = "\t")
gaba.atac.control <- read.delim("diffbind.MACS2.consensus.gaba.control.bed", header = FALSE, sep = "\t")
gaba.atac.opioid <- read.delim("diffbind.MACS2.consensus.gaba.heroin.bed", header = FALSE, sep = "\t")
olig.atac.control <- read.delim("diffbind.MACS2.consensus.olig.control.bed", header = FALSE, sep = "\t")
olig.atac.opioid <- read.delim("diffbind.MACS2.consensus.olig.heroin.bed", header = FALSE, sep = "\t")

# --- RNA-seq tables (comma-separated, with header) ---------------------------------------
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
# Differential-expression calls.
glu.DE <- read.csv("Glu_intersection.csv", header = TRUE)
gaba.DE <- read.csv("Gaba_intersection.csv", header = TRUE)
olig.DE <- read.csv("Olig_intersection.csv", header = TRUE)
# TMM-normalized counts, read from the RNAseq directory set above; the cluster location
# was /gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only
glu.TMM <- read.csv("Glu_edgeR_TMM_normalized_counts.csv", header = TRUE)
gaba.TMM <- read.csv("Gaba_edgeR_TMM_normalized_counts.csv", header = TRUE)
olig.TMM <- read.csv("Olig_edgeR_TMM_normalized_counts.csv", header = TRUE)

# One representative sample per condition: column 1 is the gene id, column 6 the control
# sample and column 9 the opioid sample.
glu.TMM.control <- glu.TMM[, c(1, 6)]
glu.TMM.opioid <- glu.TMM[, c(1, 9)]
gaba.TMM.control <- gaba.TMM[, c(1, 6)]
gaba.TMM.opioid <- gaba.TMM[, c(1, 9)]
olig.TMM.control <- olig.TMM[, c(1, 6)]
olig.TMM.opioid <- olig.TMM[, c(1, 9)]
# Per cell type: count consensus ATAC peaks lying within 2 kb of each expressed gene, for the
# control and the opioid peak set, and compare peak counts with the expression log2 ratio.
# glu
# Attach gene coordinates to the expression values of the chosen control / opioid sample.
colnames(glu.TMM.control) <- c("gid", "glu.276")
colnames(glu.TMM.opioid) <- c("gid", "glu.302")
glu.TMM.control.gid <- inner_join(glu.TMM.control, gene.id, by="gid")
glu.TMM.opioid.gid <- inner_join(glu.TMM.opioid, gene.id, by="gid")
# Widen every peak by 2 kb on both sides (from the original V2/V3), then intersect with genes.
glu.atac.control$chr <- glu.atac.control$V1
glu.atac.control$start <- glu.atac.control$V2 - 2000
glu.atac.control$end <- glu.atac.control$V3 + 2000
glu.atac.control.gene <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by=c("chr", "start", "end"))
glu.atac.opioid$chr <- glu.atac.opioid$V1
glu.atac.opioid$start <- glu.atac.opioid$V2 - 2000
glu.atac.opioid$end <- glu.atac.opioid$V3 + 2000
glu.atac.opioid.gene <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by=c("chr", "start", "end"))
# Number of overlapping peaks per gene, reduced to one row per gene. The positional
# selection c(6,7,16) has to yield gid, the expression column and the peak count, because
# those are the columns used by the join and the mutate below.
glu.control.gene.count <- glu.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
glu.control.gene.count.uniq <- unique(glu.control.gene.count[,c(6,7,16)])
glu.opioid.gene.count <- glu.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
glu.opioid.gene.count.uniq <- unique(glu.opioid.gene.count[,c(6,7,16)])
# NOTE(review): inner_join keeps only genes with at least one peak in BOTH conditions, so
# genes that gain or lose all their peaks are dropped — confirm this is intended.
glu.count <- inner_join(glu.control.gene.count.uniq, glu.opioid.gene.count.uniq, by="gid")
# peak.diff = control - opioid peak count; log2fc = log2(control / opioid expression).
glu.summary <- glu.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(glu.276/glu.302))
# gaba (same steps as glu)
colnames(gaba.TMM.control) <- c("gid", "gaba.276")
colnames(gaba.TMM.opioid) <- c("gid", "gaba.302")
gaba.TMM.control.gid <- inner_join(gaba.TMM.control, gene.id, by="gid")
gaba.TMM.opioid.gid <- inner_join(gaba.TMM.opioid, gene.id, by="gid")
gaba.atac.control$chr <- gaba.atac.control$V1
gaba.atac.control$start <- gaba.atac.control$V2 - 2000
gaba.atac.control$end <- gaba.atac.control$V3 + 2000
gaba.atac.control.gene <- genome_intersect(gaba.atac.control, gaba.TMM.control.gid, by=c("chr", "start", "end"))
gaba.atac.opioid$chr <- gaba.atac.opioid$V1
gaba.atac.opioid$start <- gaba.atac.opioid$V2 - 2000
gaba.atac.opioid$end <- gaba.atac.opioid$V3 + 2000
gaba.atac.opioid.gene <- genome_intersect(gaba.atac.opioid, gaba.TMM.opioid.gid, by=c("chr", "start", "end"))
gaba.control.gene.count <- gaba.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
gaba.control.gene.count.uniq <- unique(gaba.control.gene.count[,c(6,7,16)])
gaba.opioid.gene.count <- gaba.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
gaba.opioid.gene.count.uniq <- unique(gaba.opioid.gene.count[,c(6,7,16)])
gaba.count <- inner_join(gaba.control.gene.count.uniq, gaba.opioid.gene.count.uniq, by="gid")
gaba.summary <- gaba.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(gaba.276/gaba.302))
# olig (same steps as glu)
colnames(olig.TMM.control) <- c("gid", "olig.276")
colnames(olig.TMM.opioid) <- c("gid", "olig.302")
olig.TMM.control.gid <- inner_join(olig.TMM.control, gene.id, by="gid")
olig.TMM.opioid.gid <- inner_join(olig.TMM.opioid, gene.id, by="gid")
olig.atac.control$chr <- olig.atac.control$V1
olig.atac.control$start <- olig.atac.control$V2 - 2000
olig.atac.control$end <- olig.atac.control$V3 + 2000
olig.atac.control.gene <- genome_intersect(olig.atac.control, olig.TMM.control.gid, by=c("chr", "start", "end"))
olig.atac.opioid$chr <- olig.atac.opioid$V1
olig.atac.opioid$start <- olig.atac.opioid$V2 - 2000
olig.atac.opioid$end <- olig.atac.opioid$V3 + 2000
olig.atac.opioid.gene <- genome_intersect(olig.atac.opioid, olig.TMM.opioid.gid, by=c("chr", "start", "end"))
olig.control.gene.count <- olig.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
olig.control.gene.count.uniq <- unique(olig.control.gene.count[,c(6,7,16)])
olig.opioid.gene.count <- olig.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
olig.opioid.gene.count.uniq <- unique(olig.opioid.gene.count[,c(6,7,16)])
olig.count <- inner_join(olig.control.gene.count.uniq, olig.opioid.gene.count.uniq, by="gid")
# Unlike glu/gaba, olig also gets peak.diff.percent, which shifts log2fc one column to the right.
olig.summary <- olig.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, peak.diff.percent = ((control.peak.count-opioid.peak.count)/control.peak.count)*100, log2fc = log2(olig.276/olig.302))
# Combine the per-cell-type summaries (gid, peak.diff, log2fc), attach the DE calls and
# look at how peak-count differences relate to expression changes.
glu.df <- glu.summary[,c(1,6,7)]
gaba.df <- gaba.summary[,c(1,6,7)]
# olig.summary carries an extra peak.diff.percent column at position 7, so log2fc is column 8.
olig.df <- olig.summary[,c(1,6,8)]
glu.df$cell_type <- "GLU"
gaba.df$cell_type <- "GABA"
olig.df$cell_type <- "OLIG"
all.df <- rbind(glu.df, gaba.df, olig.df)
# left_join keeps genes without a DE record; their DE columns are NA.
colnames(glu.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
glu.df.de <- left_join(glu.df, glu.DE, by="gid")
colnames(gaba.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
gaba.df.de <- left_join(gaba.df, gaba.DE, by="gid")
colnames(olig.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
olig.df.de <- left_join(olig.df, olig.DE, by="gid")
all.de.df <- rbind(glu.df.de, gaba.df.de, olig.df.de)
# Row-wise NA removal on gid, peak.diff, log2fc, cell_type keeps the pairs aligned.
all.de.df.na <- na.omit(all.de.df[,1:4])
cor(all.de.df.na$peak.diff, all.de.df.na$log2fc)
# -0.003478795
all.de.df.na.olig <- subset(all.de.df.na, all.de.df.na$cell_type == "OLIG")
cor(all.de.df.na.olig$peak.diff, all.de.df.na.olig$log2fc)
# -0.0182298
# alpha is a fixed setting, so it belongs outside aes(); inside aes() it is treated as a
# mapped variable (adds a legend and does not set the transparency to 0.5).
ggplot(na.omit(all.de.df)) + geom_point(aes(x=peak.diff, y=log2fc), alpha=0.5) + facet_grid(cell_type ~ DE.sig) + theme_classic()
ggplot(na.omit(all.de.df)) + geom_boxplot(aes(x=DE.sig, y=peak.diff), alpha=0.5) + facet_grid(cell_type ~ .) + theme_classic()
### look at just association between peak number and expression (test on control olig)
# use="complete.obs" drops incomplete PAIRS. Calling na.omit() on each vector separately
# can shift the two vectors against each other (or leave them with different lengths).
cor(olig.summary$olig.276, olig.summary$control.peak.count, use="complete.obs")
# 0.276537
ggplot(na.omit(olig.summary)) + geom_point(aes(x=olig.276, y=control.peak.count)) + theme_classic()
cor(olig.summary$olig.302, olig.summary$opioid.peak.count, use="complete.obs")
# 0.2473464
cor(gaba.summary$gaba.276, gaba.summary$control.peak.count, use="complete.obs")
# 0.3222366
cor(gaba.summary$gaba.302, gaba.summary$opioid.peak.count, use="complete.obs")
# 0.3453775
cor(olig.df.de$peak.diff, olig.df.de$log2fc, use="complete.obs")
##### try 10kb regions
# Same peak-to-gene counting as the 2 kb pass, with every peak widened by 10 kb on both
# sides. The windows are rebuilt from the original V2/V3 columns, so they do not stack on
# top of the earlier 2 kb extension. All *.summary objects from the 2 kb pass are overwritten.
# glu
colnames(glu.TMM.control) <- c("gid", "glu.276")
colnames(glu.TMM.opioid) <- c("gid", "glu.302")
glu.TMM.control.gid <- inner_join(glu.TMM.control, gene.id, by="gid")
glu.TMM.opioid.gid <- inner_join(glu.TMM.opioid, gene.id, by="gid")
glu.atac.control$chr <- glu.atac.control$V1
glu.atac.control$start <- glu.atac.control$V2 - 10000
glu.atac.control$end <- glu.atac.control$V3 + 10000
glu.atac.control.gene <- genome_intersect(glu.atac.control, glu.TMM.control.gid, by=c("chr", "start", "end"))
glu.atac.opioid$chr <- glu.atac.opioid$V1
glu.atac.opioid$start <- glu.atac.opioid$V2 - 10000
glu.atac.opioid$end <- glu.atac.opioid$V3 + 10000
glu.atac.opioid.gene <- genome_intersect(glu.atac.opioid, glu.TMM.opioid.gid, by=c("chr", "start", "end"))
# Peaks per gene, one row per gene; c(6,7,16) must yield gid, expression and peak count.
glu.control.gene.count <- glu.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
glu.control.gene.count.uniq <- unique(glu.control.gene.count[,c(6,7,16)])
glu.opioid.gene.count <- glu.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
glu.opioid.gene.count.uniq <- unique(glu.opioid.gene.count[,c(6,7,16)])
# Only genes with peaks in both conditions survive the inner join.
glu.count <- inner_join(glu.control.gene.count.uniq, glu.opioid.gene.count.uniq, by="gid")
# peak.diff = control - opioid peak count; log2fc = log2(control / opioid expression).
glu.summary <- glu.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(glu.276/glu.302))
# gaba
colnames(gaba.TMM.control) <- c("gid", "gaba.276")
colnames(gaba.TMM.opioid) <- c("gid", "gaba.302")
gaba.TMM.control.gid <- inner_join(gaba.TMM.control, gene.id, by="gid")
gaba.TMM.opioid.gid <- inner_join(gaba.TMM.opioid, gene.id, by="gid")
gaba.atac.control$chr <- gaba.atac.control$V1
gaba.atac.control$start <- gaba.atac.control$V2 - 10000
gaba.atac.control$end <- gaba.atac.control$V3 + 10000
gaba.atac.control.gene <- genome_intersect(gaba.atac.control, gaba.TMM.control.gid, by=c("chr", "start", "end"))
gaba.atac.opioid$chr <- gaba.atac.opioid$V1
gaba.atac.opioid$start <- gaba.atac.opioid$V2 - 10000
gaba.atac.opioid$end <- gaba.atac.opioid$V3 + 10000
gaba.atac.opioid.gene <- genome_intersect(gaba.atac.opioid, gaba.TMM.opioid.gid, by=c("chr", "start", "end"))
gaba.control.gene.count <- gaba.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
gaba.control.gene.count.uniq <- unique(gaba.control.gene.count[,c(6,7,16)])
gaba.opioid.gene.count <- gaba.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
gaba.opioid.gene.count.uniq <- unique(gaba.opioid.gene.count[,c(6,7,16)])
gaba.count <- inner_join(gaba.control.gene.count.uniq, gaba.opioid.gene.count.uniq, by="gid")
gaba.summary <- gaba.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(gaba.276/gaba.302))
# olig
colnames(olig.TMM.control) <- c("gid", "olig.276")
colnames(olig.TMM.opioid) <- c("gid", "olig.302")
olig.TMM.control.gid <- inner_join(olig.TMM.control, gene.id, by="gid")
olig.TMM.opioid.gid <- inner_join(olig.TMM.opioid, gene.id, by="gid")
olig.atac.control$chr <- olig.atac.control$V1
olig.atac.control$start <- olig.atac.control$V2 - 10000
olig.atac.control$end <- olig.atac.control$V3 + 10000
olig.atac.control.gene <- genome_intersect(olig.atac.control, olig.TMM.control.gid, by=c("chr", "start", "end"))
olig.atac.opioid$chr <- olig.atac.opioid$V1
olig.atac.opioid$start <- olig.atac.opioid$V2 - 10000
olig.atac.opioid$end <- olig.atac.opioid$V3 + 10000
olig.atac.opioid.gene <- genome_intersect(olig.atac.opioid, olig.TMM.opioid.gid, by=c("chr", "start", "end"))
olig.control.gene.count <- olig.atac.control.gene %>% group_by(gid) %>% mutate(control.peak.count = n())
olig.control.gene.count.uniq <- unique(olig.control.gene.count[,c(6,7,16)])
olig.opioid.gene.count <- olig.atac.opioid.gene %>% group_by(gid) %>% mutate(opioid.peak.count = n())
olig.opioid.gene.count.uniq <- unique(olig.opioid.gene.count[,c(6,7,16)])
olig.count <- inner_join(olig.control.gene.count.uniq, olig.opioid.gene.count.uniq, by="gid")
# Here olig gets the same two derived columns as glu/gaba (no peak.diff.percent).
olig.summary <- olig.count %>% group_by(gid) %>% mutate(peak.diff = control.peak.count-opioid.peak.count, log2fc = log2(olig.276/olig.302))
# Combine the 10 kb per-cell-type summaries (gid, peak.diff, log2fc), attach the DE calls,
# then plot and correlate peak counts against expression.
glu.df <- glu.summary[,c(1,6,7)]
gaba.df <- gaba.summary[,c(1,6,7)]
olig.df <- olig.summary[,c(1,6,7)]
glu.df$cell_type <- "GLU"
gaba.df$cell_type <- "GABA"
olig.df$cell_type <- "OLIG"
all.df <- rbind(glu.df, gaba.df, olig.df)
# left_join keeps genes without a DE record; their DE columns are NA.
colnames(glu.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
glu.df.de <- left_join(glu.df, glu.DE, by="gid")
colnames(gaba.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
gaba.df.de <- left_join(gaba.df, gaba.DE, by="gid")
colnames(olig.DE) <- c("row", "gid", "DE.sig", "log2FC", "padj")
olig.df.de <- left_join(olig.df, olig.DE, by="gid")
all.de.df <- rbind(glu.df.de, gaba.df.de, olig.df.de)
# alpha is a fixed setting, so it belongs outside aes(); inside aes() it is treated as a
# mapped variable (adds a legend and does not set the transparency to 0.5).
ggplot(na.omit(all.de.df)) + geom_point(aes(x=peak.diff, y=log2fc), alpha=0.5) + facet_grid(cell_type ~ DE.sig) + theme_classic()
ggplot(na.omit(all.de.df)) + geom_boxplot(aes(x=DE.sig, y=peak.diff), alpha=0.5) + facet_grid(cell_type ~ .) + theme_classic()
### look at just association between peak number and expression (test on control olig)
# use="complete.obs" drops incomplete PAIRS. Calling na.omit() on each vector separately
# can shift the two vectors against each other (or leave them with different lengths).
cor(olig.summary$olig.276, olig.summary$control.peak.count, use="complete.obs")
#
ggplot(na.omit(olig.summary)) + geom_point(aes(x=olig.276, y=control.peak.count)) + theme_classic()
cor(olig.summary$olig.302, olig.summary$opioid.peak.count, use="complete.obs")
#
cor(gaba.summary$gaba.276, gaba.summary$control.peak.count, use="complete.obs")
#
cor(gaba.summary$gaba.302, gaba.summary$opioid.peak.count, use="complete.obs")
#
# Categorical view: how do ATAC peak-count differences and expression changes break down?
summary(all.de.df$log2fc)
summary(all.de.df$peak.diff)
# NOTE(review): olig.value.summary is not created in the part of the script shown here —
# confirm it exists before these two calls.
summary(olig.value.summary$log2fc)
summary(olig.value.summary$peak.log2fc)
# all.de.df = XX

# Genes whose peak count differs by more than 5, tallied per cell type and DE call.
# Columns 4, 6, 9 of the counted table are cell_type, DE.sig and count.
all.de.df.peakdiff5 <- subset(all.de.df, abs(peak.diff) > 5)
#
all.de.df.peakdiff5.count <- mutate(group_by(all.de.df.peakdiff5, cell_type, DE.sig), count = n())
peakdiff5.count <- unique(all.de.df.peakdiff5.count[, c(4, 6, 9)])

# Same tally for a difference of more than 10 peaks.
all.de.df.peakdiff10 <- subset(all.de.df, abs(peak.diff) > 10)
#
all.de.df.peakdiff10.count <- mutate(group_by(all.de.df.peakdiff10, cell_type, DE.sig), count = n())
peakdiff10.count <- unique(all.de.df.peakdiff10.count[, c(4, 6, 9)])

# ... and for genes with essentially no difference (at most 5 peaks).
all.de.df.peakdiffno <- subset(all.de.df, abs(peak.diff) <= 5)
#
all.de.df.peakdiffno.count <- mutate(group_by(all.de.df.peakdiffno, cell_type, DE.sig), count = n())
peakdiffno.count <- unique(all.de.df.peakdiffno.count[, c(4, 6, 9)])

# look at just the DE genes???
# DE = called UP or DOWN; nonDE = any other non-missing call. Genes without a DE record
# (DE.sig is NA) end up in neither set.
all.DE <- subset(all.de.df, DE.sig %in% c("UP", "DOWN"))
#
summary(all.DE$peak.diff)
summary(all.DE$log2fc)
all.nonDE <- subset(all.de.df, !is.na(DE.sig) & !(DE.sig %in% c("UP", "DOWN")))
#
summary(all.nonDE$peak.diff)
summary(all.nonDE$log2fc)

# Label both sets, stack them, and compare the distributions.
all.DE[["type"]] <- "DE"
all.nonDE[["type"]] <- "nonDE"
all <- rbind(all.DE, all.nonDE)
ggplot(na.omit(all), aes(x = type, y = peak.diff)) + geom_violin() + theme_classic()
ggplot(na.omit(all), aes(x = type, y = log2fc)) + geom_violin() + theme_classic()
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Build a filtered copy of the MACS2 peak files (column 7 >= threshold) plus a matching
# DiffBind sample sheet that points at the filtered files.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
# Minimum value a peak must reach in column 7 to be kept; also used in all output names.
# (Column 7 is signalValue if these files are in narrowPeak layout — TODO confirm.)
thr=5
# Work on an uncompressed copy so the original peak files stay untouched.
mkdir -p "bwa.output/macs.output/peaks.sig${thr}"
cp bwa.output/macs.output/peaks/* "bwa.output/macs.output/peaks.sig${thr}/."
gunzip "bwa.output/macs.output/peaks.sig${thr}"/*
cd "bwa.output/macs.output/peaks.sig${thr}"
# The glob is expanded once, before the loop runs, so the freshly written *.sig*.bed
# files are not picked up again.
for i in *.bed; do
    awk -v thr="$thr" '$7 >= thr' "$i" > "$i.sig${thr}.bed"
done
# Drop the unfiltered copies and compress the filtered files.
rm -- *.R1.macs2.bed
gzip -- *macs2.bed.sig"${thr}".bed
# Derive the sample sheet: redirect the peak directory and the peak file names.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files
sed "s/peaks/peaks.sig${thr}/g" diffbind.macs2.outlier.sample.csv | sed "s/macs2\.bed/macs2.bed.sig${thr}.bed/g" > "diffbind.macs2.sig${thr}.outlier.sample.csv"
# R
# Consensus peak sets from the sig5-filtered peaks, requiring a peak in at least 66% of
# the samples of a group.
library(DiffBind)
library(tidyverse)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.macs2.sig5.outlier.sample.csv")
names(samples)
df <- dba(sampleSheet="key.files/diffbind.macs2.sig5.outlier.sample.csv")
head(df$peaks[[1]])
# Chr Start End Score
# 1 NC_000001.11 9983 10574 0.027730460
# 2 NC_000001.11 180689 181134 0.012478707
# 3 NC_000001.11 181304 181624 0.004654756
# 4 NC_000001.11 191202 191880 0.193281306
# 5 NC_000001.11 586084 586272 0.003664382
# 6 NC_000001.11 629850 630058 0.010339500
write.table(df$peaks[[1]], "diffbind.MACS2.sig5.peaks.txt", quote=F, row.names=F, sep="\t")
# Consensus per tissue x condition; the consensus peaksets are appended after the samples.
df.counted = dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.66)
df.counted
# Write every consensus peakset to its own file. The peaksets are located through the
# Consensus mask and named after their ID ("glu:heroin" -> "glu.heroin") instead of the
# fixed positions 149-154, which are only right for one particular number of samples.
consensus.idx <- which(df.counted$masks$Consensus)
consensus.ids <- as.character(dba.show(df.counted, mask=df.counted$masks$Consensus)$ID)
stopifnot(length(consensus.idx) == length(consensus.ids))
for (k in seq_along(consensus.idx)) {
  out.name <- paste0("diffbind.MACS2.sig5.consensus.", sub(":", ".", consensus.ids[k], fixed=TRUE), ".txt")
  write.table(df.counted$peaks[[consensus.idx[k]]], out.name, quote=F, row.names=F, sep="\t")
}
df_consensus = dba(df.counted, mask = df.counted$masks$Consensus)
df_consensus
# 6 Samples, 75841 sites in matrix (95866 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig:control olig control control 1 34809
# 2 olig:heroin olig heroin heroin 1 39611
# 3 gaba:control gaba control control 1 40827
# 4 gaba:heroin gaba heroin heroin 1 37286
# 5 glu:control glu control control 1 67449
# 6 glu:heroin glu heroin heroin 1 55603
# Consensus per condition only (cell types pooled).
df.condition = dba.peakset(df, consensus = c(DBA_CONDITION), minOverlap=0.66)
df.condition.consensus <- dba(df.condition, mask = df.condition$masks$Consensus)
df.condition.consensus
# 2 Samples, 14988 sites in matrix (17337 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 control olig-gaba-glu control control 1 15973
# 2 heroin olig-gaba-glu heroin heroin 1 16375
# Consensus per cell type only (conditions pooled).
df.cell = dba.peakset(df, consensus = c(DBA_TISSUE), minOverlap=0.66)
df.cell.consensus <- dba(df.cell, mask = df.cell$masks$Consensus)
df.cell.consensus
# 3 Samples, 19982 sites in matrix (46085 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig olig control-heroin control-heroin 1 22115
# 2 gaba gaba control-heroin control-heroin 1 21922
# 3 glu glu control-heroin control-heroin 1 34069
# Venn diagrams of the overlap between the consensus sets.
pdf("consensus.MACS2.sig5.condition.overlap.pdf")
dba.plotVenn(df.condition,df.condition$masks$Consensus)
dev.off()
pdf("consensus.MACS2.sig5.cell.overlap.pdf")
dba.plotVenn(df.cell,df.cell$masks$Consensus)
dev.off()
## use 33% overlap instead of 66%
# Same consensus sets as above from the sig5-filtered peaks (`df`), with the looser threshold.
df.counted = dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.33)
df.counted
# Write every consensus peakset to its own file. The peaksets are located through the
# Consensus mask and named after their ID ("glu:heroin" -> "glu.heroin") instead of the
# fixed positions 149-154, which are only right for one particular number of samples.
consensus.idx <- which(df.counted$masks$Consensus)
consensus.ids <- as.character(dba.show(df.counted, mask=df.counted$masks$Consensus)$ID)
stopifnot(length(consensus.idx) == length(consensus.ids))
for (k in seq_along(consensus.idx)) {
  out.name <- paste0("diffbind.MACS2.sig5.overlap33.consensus.", sub(":", ".", consensus.ids[k], fixed=TRUE), ".txt")
  write.table(df.counted$peaks[[consensus.idx[k]]], out.name, quote=F, row.names=F, sep="\t")
}
df_consensus = dba(df.counted, mask = df.counted$masks$Consensus)
df_consensus
# NOTE(review): the table recorded below is identical to the one recorded for
# minOverlap=0.66 — possibly a stale copy; re-run to confirm the 33% numbers.
# 6 Samples, 75841 sites in matrix (95866 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig:control olig control control 1 34809
# 2 olig:heroin olig heroin heroin 1 39611
# 3 gaba:control gaba control control 1 40827
# 4 gaba:heroin gaba heroin heroin 1 37286
# 5 glu:control glu control control 1 67449
# 6 glu:heroin glu heroin heroin 1 55603
# Consensus per condition only (cell types pooled).
df.condition = dba.peakset(df, consensus = c(DBA_CONDITION), minOverlap=0.33)
df.condition.consensus <- dba(df.condition, mask = df.condition$masks$Consensus)
df.condition.consensus
# 2 Samples, 32615 sites in matrix (39423 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 control olig-gaba-glu control control 1 37374
# 2 heroin olig-gaba-glu heroin heroin 1 34762
# Consensus per cell type only (conditions pooled).
df.cell = dba.peakset(df, consensus = c(DBA_TISSUE), minOverlap=0.33)
df.cell.consensus <- dba(df.cell, mask = df.cell$masks$Consensus)
df.cell.consensus
# 3 Samples, 33126 sites in matrix (85431 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig olig control-heroin control-heroin 1 36529
# 2 gaba gaba control-heroin control-heroin 1 38240
# 3 glu glu control-heroin control-heroin 1 62637
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# Build a filtered copy of the MACS2 peak files (column 7 >= threshold) plus a matching
# DiffBind sample sheet that points at the filtered files.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
# Minimum value a peak must reach in column 7 to be kept; also used in all output names.
# (Column 7 is signalValue if these files are in narrowPeak layout — TODO confirm.)
thr=2
# Work on an uncompressed copy so the original peak files stay untouched.
mkdir -p "bwa.output/macs.output/peaks.sig${thr}"
cp bwa.output/macs.output/peaks/* "bwa.output/macs.output/peaks.sig${thr}/."
gunzip "bwa.output/macs.output/peaks.sig${thr}"/*
cd "bwa.output/macs.output/peaks.sig${thr}"
# The glob is expanded once, before the loop runs, so the freshly written *.sig*.bed
# files are not picked up again.
for i in *.bed; do
    awk -v thr="$thr" '$7 >= thr' "$i" > "$i.sig${thr}.bed"
done
# Drop the unfiltered copies and compress the filtered files.
rm -- *.R1.macs2.bed
gzip -- *macs2.bed.sig"${thr}".bed
# Derive the sample sheet: redirect the peak directory and the peak file names.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files
sed "s/peaks/peaks.sig${thr}/g" diffbind.macs2.outlier.sample.csv | sed "s/macs2\.bed/macs2.bed.sig${thr}.bed/g" > "diffbind.macs2.sig${thr}.outlier.sample.csv"
# R
# Consensus peak sets from the sig2-filtered peaks, requiring a peak in at least 66% of
# the samples of a group.
library(DiffBind)
library(tidyverse)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.macs2.sig2.outlier.sample.csv")
names(samples)
df <- dba(sampleSheet="key.files/diffbind.macs2.sig2.outlier.sample.csv")
head(df$peaks[[1]])
write.table(df$peaks[[1]], "diffbind.MACS2.sig2.peaks.txt", quote=F, row.names=F, sep="\t")
# Consensus per tissue x condition; the consensus peaksets are appended after the samples.
df.counted = dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.66)
df.counted
# Write every consensus peakset to its own file. The peaksets are located through the
# Consensus mask and named after their ID ("glu:heroin" -> "glu.heroin") instead of the
# fixed positions 149-154, which are only right for one particular number of samples.
consensus.idx <- which(df.counted$masks$Consensus)
consensus.ids <- as.character(dba.show(df.counted, mask=df.counted$masks$Consensus)$ID)
stopifnot(length(consensus.idx) == length(consensus.ids))
for (k in seq_along(consensus.idx)) {
  out.name <- paste0("diffbind.MACS2.sig2.consensus.", sub(":", ".", consensus.ids[k], fixed=TRUE), ".txt")
  write.table(df.counted$peaks[[consensus.idx[k]]], out.name, quote=F, row.names=F, sep="\t")
}
df_consensus = dba(df.counted, mask = df.counted$masks$Consensus)
df_consensus
# 6 Samples, 94312 sites in matrix (117273 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig:control olig control control 1 38028
# 2 olig:heroin olig heroin heroin 1 46005
# 3 gaba:control gaba control control 1 51282
# 4 gaba:heroin gaba heroin heroin 1 47287
# 5 glu:control glu control control 1 83812
# 6 glu:heroin glu heroin heroin 1 78568
# Consensus per condition only (cell types pooled).
df.condition = dba.peakset(df, consensus = c(DBA_CONDITION), minOverlap=0.66)
df.condition.consensus <- dba(df.condition, mask = df.condition$masks$Consensus)
df.condition.consensus
# 2 Samples, 31840 sites in matrix (37420 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 control olig-gaba-glu control control 1 34169
# 2 heroin olig-gaba-glu heroin heroin 1 35380
# Consensus per cell type only (conditions pooled).
df.cell = dba.peakset(df, consensus = c(DBA_TISSUE), minOverlap=0.66)
df.cell.consensus <- dba(df.cell, mask = df.cell$masks$Consensus)
df.cell.consensus
# 3 Samples, 43495 sites in matrix (105949 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig olig control-heroin control-heroin 1 40578
# 2 gaba gaba control-heroin control-heroin 1 50061
# 3 glu glu control-heroin control-heroin 1 82168
# Venn diagrams of the overlap between the consensus sets.
pdf("consensus.MACS2.sig2.condition.overlap.pdf")
dba.plotVenn(df.condition,df.condition$masks$Consensus)
dev.off()
pdf("consensus.MACS2.sig2.cell.overlap.pdf")
dba.plotVenn(df.cell,df.cell$masks$Consensus)
dev.off()
## use 33% overlap instead of 66%
# Same three consensus groupings as before on the sig2-filtered peaks (`df`), with
# minOverlap lowered to 0.33. No peak files are written in this pass; only the
# summaries are printed and recorded below.
# Consensus per tissue x condition.
df.counted = dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.33)
df.counted
df_consensus = dba(df.counted, mask = df.counted$masks$Consensus)
df_consensus
# 6 Samples, 169629 sites in matrix (212458 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig:control olig control control 1 75094
# 2 olig:heroin olig heroin heroin 1 87373
# 3 gaba:control gaba control control 1 94952
# 4 gaba:heroin gaba heroin heroin 1 89116
# 5 glu:control glu control control 1 135861
# 6 glu:heroin glu heroin heroin 1 132216
# Consensus per condition only (cell types pooled).
df.condition = dba.peakset(df, consensus = c(DBA_CONDITION), minOverlap=0.33)
df.condition.consensus <- dba(df.condition, mask = df.condition$masks$Consensus)
df.condition.consensus
# 2 Samples, 84834 sites in matrix (106499 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 control olig-gaba-glu control control 1 97603
# 2 heroin olig-gaba-glu heroin heroin 1 95348
# Consensus per cell type only (conditions pooled).
df.cell = dba.peakset(df, consensus = c(DBA_TISSUE), minOverlap=0.33)
df.cell.consensus <- dba(df.cell, mask = df.cell$masks$Consensus)
df.cell.consensus
# 3 Samples, 75203 sites in matrix (187718 total):
# ID Tissue Condition Treatment Replicate Intervals
# 1 olig olig control-heroin control-heroin 1 78129
# 2 gaba gaba control-heroin control-heroin 1 88742
# 3 glu glu control-heroin control-heroin 1 134587
# MSPC command-line input documentation: https://genometric.github.io/MSPC/docs/cli/input
# Install MSPC: download the latest linux-x64 release, unpack it into ./mspc and make the
# binary executable.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
wget -O linux-x64.zip https://github.com/Genometric/MSPC/releases/latest/download/linux-x64.zip
unzip -q linux-x64.zip -d mspc && cd mspc && chmod +x mspc
# MSPC is given the uncompressed peak files, so decompress them in place first.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/peaks
gunzip *.gz
# Run MSPC over every BED file in the directory. Per the MSPC CLI documentation:
# -r replicate type (bio = biological), -w weak p-value threshold, -s stringent threshold.
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/mspc/mspc -i *.bed -r bio -w 1e-4 -s 1e-8
## Too much memory for base node? Submit as slurm job on andes
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J mspc.peaks
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
# Slurm batch script (account SYB105, job name "mspc.peaks", partition gpu, 2 nodes,
# 24 h limit) that runs the same MSPC command as the interactive attempt.
# The peak files in this directory must already be uncompressed (*.bed).
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/peaks
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/mspc/mspc -i *.bed -r bio -w 1e-4 -s 1e-8
# Submit with:
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/mspc.sh
# Collect the Confirmed.bed that MSPC wrote for each input file into one directory, naming
# every copy after the per-sample folder it came from, then re-compress the input peaks.
peaks_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/peaks
# MSPC session whose results are collected. A single variable keeps the source of the files
# and the output location consistent; previously the shell cd'ed into
# session_20210903_165201856 while copying from session_20210907_104822678.
session_dir="$peaks_dir/session_20210907_104822678"
# Same place the relative ../mspc.consensus used to resolve to.
consensus_dir="$peaks_dir/mspc.consensus"
mkdir -p "$consensus_dir" || exit 1
for pathname in "$session_dir"/*/Confirmed.bed; do
    # Skip the literal pattern when the glob matches nothing.
    [ -e "$pathname" ] || continue
    cp "$pathname" "$consensus_dir/$( basename "$( dirname "$pathname" )" ).bed"
done
# Re-compress the peak files that were gunzipped for the MSPC run.
gzip "$peaks_dir"/*.bed
# bcbio-nextgen ATAC-seq pipeline documentation: https://bcbio-nextgen.readthedocs.io/en/latest/contents/atac.html#output
# Install bcbio-nextgen without reference data, then add the hg38 genome with bwa indices.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
wget https://raw.github.com/bcbio/bcbio-nextgen/master/scripts/bcbio_nextgen_install.py
# NOTE(review): bcbio_installation_path and tools_installation_path look like placeholders
# from the install instructions — replace them with real directories.
python bcbio_nextgen_install.py bcbio_installation_path --tooldir=tools_installation_path --nodata
bcbio_nextgen.py upgrade -u skip --genomes hg38 --aligners bwa
# Project layout: bcbio/fastq holds symlinks to the trimmed reads, bcbio/metadata holds the
# template yaml and the sample sheet.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
mkdir bcbio
mkdir bcbio/fastq
ln -s /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/trimmed/* bcbio/fastq
mkdir -p bcbio/metadata
cd bcbio
# Example ATAC template from the bcbio user story, saved as metadata/atac-example.yaml.
wget --no-check-certificate http://s3.amazonaws.com/bcbio-nextgen/atac_userstory_data/atac-example.yaml -O metadata/atac-example.yaml
# make sample sheet
# Load the conda environment that provides R for the sample-sheet step.
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# Build the bcbio sample sheet (CSV) from the project metadata table.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac")
df <- read.delim("ATAC.All.Metadata.txt", header=TRUE, sep="\t")
# Column 1 is used twice: as bcbio's "samplename" and as its "description".
# Columns 3 and 5 are carried along as extra metadata under their original names.
# NOTE(review): bcbio matches "samplename" against the fastq file names -- confirm
# that column 1 holds those names.
df2 <- df[, c(1, 1, 3, 5)]
names(df2)[1:2] <- c("samplename", "description")
# Write the sheet where the `bcbio_nextgen.py -w template ... metadata/atac.meta.csv`
# call (run from the bcbio/ directory) looks for it; the previous target,
# bwa.output/macs.output/metadata, is never read by that call.
out.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bcbio/metadata"
dir.create(out.dir, recursive=TRUE, showWarnings=FALSE)
# bcbio treats the first line of the CSV as the header, so the column names must
# be written; without them the first sample would be consumed as the header.
write.table(df2, file.path(out.dir, "atac.meta.csv"), quote=FALSE, row.names=FALSE, col.names=TRUE, sep=",")
# generate yaml config
bcbio_nextgen.py -w template metadata/atac-opioid.yaml metadata/atac.meta.csv fastq
# run analysis
cd hindbrain_forebrain/work
bcbio_nextgen.py ../config/hindbrain_forebrain.yaml -n 16
https://github.com/jsh58/Genrich#method
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
#wget https://github.com/jsh58/Genrich
#conda install -c bioconda genrich
conda create --prefix /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/genrich -c bioconda genrich
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J genrich
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
# genrich.sh -- call consensus ATAC peaks with Genrich over every sample's bwa
# alignment (one *.R1.bwa.sam per sample, 179 samples in total).
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/genrich
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam || exit 1

# The input list used to be pasted by hand and had been hard-wrapped in the
# middle of file names, which breaks the command. Build it from the directory
# instead. Byte-wise (C locale) glob order gives GABA-*, GLU-*, OLIG-*,
# pool-A-*, pool-B-*, i.e. the same order as the hand-written list, which is
# also the column order downstream code assumes for genrich.log.
export LC_ALL=C
shopt -s nullglob
sams=( *.R1.bwa.sam )
# Guard the assumption that this directory holds exactly the 179 sample alignments.
if [ "${#sams[@]}" -ne 179 ]; then
    echo "genrich.sh: expected 179 *.R1.bwa.sam inputs, found ${#sams[@]}" >&2
    exit 1
fi
samlist=$(IFS=,; printf '%s' "${sams[*]}")

# -j ATAC-seq mode, -y keep unpaired alignments, -r remove PCR duplicates,
# -e exclude a chromosome. The reads were aligned to the NCBI RefSeq GRCh38.p13
# FASTA, where the mitochondrial genome is named NC_012920.1; the previous
# "-e chrM" matched no sequence, so mitochondrial reads were not excluded.
Genrich -t "$samlist" -f genrich.log -o genrich.out.narrowPeak -j -y -r -e NC_012920.1 -v
#Genrich -t <BAM> -f <LOG> -o <OUT> -j -y -r -e chrM -v
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/genrich.sh
# --> calculate a value for each sample at each consensus peak
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J genrich.consensus
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
# genrich.consensus.sh -- intersect the Genrich consensus peaks with the
# per-position values from the Genrich -f log, then summarise them per peak in R.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam || exit 1
# Drop the first line of genrich.log (presumably its header) to get a BED-like
# table. This step used to be commented out, so the script only worked if
# genrich.bp.bed had been produced by hand; now it is rebuilt whenever it is
# missing, empty, or older than genrich.log.
if [ ! -s genrich.bp.bed ] || [ genrich.log -nt genrich.bp.bed ]; then
    sed '1d' genrich.log > genrich.bp.bed || exit 1
fi
# Keep the first 183 columns only.
cut -f 1-183 genrich.bp.bed > genrich.bp.cut.bed || exit 1
# -wo: write each peak together with every overlapping row and the overlap size.
bedtools intersect -wo -a genrich.out.narrowPeak -b genrich.bp.cut.bed > genrich.peak.pval.txt || exit 1
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ || exit 1
R CMD BATCH genrich.consensus.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/genrich.consensus.sh
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam
sed '1d' genrich.log > genrich.bp.bed
cut -f 1-183 genrich.bp.bed > genrich.bp.cut.bed
bedtools intersect -wo -a genrich.out.narrowPeak -b genrich.bp.cut.bed > genrich.peak.pval.txt
# genrich.consensus.R -- average the per-position Genrich values within each
# consensus peak and write a peak x sample matrix.
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam")
df <- read.delim("genrich.peak.pval.txt", header=FALSE, sep="\t")
# One row per peak name (V4), every other column replaced by its mean.
df.mean <- df %>% group_by(V4) %>% summarise_all("mean")

# Sample labels in the order the alignments were passed to Genrich.
# Fixes over the hand-pasted vector: "GABA-405m" -> "GABA-405" (typo), and the
# label "pool-B-310-GLU", which had been split across two lines inside the string.
cell.types <- c("GABA", "GLU", "OLIG")
sorted.ids <- c("276","288","302","309","316","322","339","340","345","349",
                "357","368","370","372","377","383","405","406","408","413",
                "430","444","S00-0255","S03-0019","S05-0105","S05-0252",
                "S07-0156","S08-0061","S11-0310","S13-0098")
pool.a.ids <- c("257","260","265","286","319","332","338","365","376","387",
                "394","395","398","407","427")
pool.b.ids <- c("255","310","325","333","336","343","354","375","382","385",
                "386","388","389","397","401")
# <cell type>-<id>, all ids of one cell type before the next cell type.
sorted.names <- paste0(rep(cell.types, each=length(sorted.ids)), "-", sorted.ids)
# pool-<pool>-<id>-<cell type>, the three cell types of one id kept together.
pool.names <- function(pool, ids) {
  paste0("pool-", pool, "-", rep(ids, each=length(cell.types)), "-", cell.types)
}
sample.names <- c(sorted.names,
                  pool.names("A", pool.a.ids),
                  # pool-B-388 has no OLIG sample
                  setdiff(pool.names("B", pool.b.ids), "pool-B-388-OLIG"))
stopifnot(length(sample.names) == 179)

new.names <- c("Chr","Start","End","Peak", sample.names)
# NOTE(review): group_by(V4) puts V4 first in the summarised table and takes the
# mean of every remaining column, including the non-numeric ones, and
# `bedtools intersect -wo` also emits the narrowPeak and overlap columns. Confirm
# the column layout really is Chr/Start/End/Peak followed by the 179 samples;
# the check below stops the script instead of mislabelling columns when it is not.
stopifnot(ncol(df.mean) == length(new.names))
colnames(df.mean) <- new.names
write.table(df.mean, "genrich.peak.pval.sample.matrix.txt", quote=FALSE, row.names=FALSE, sep="\t")
# --> run Genrich separately for each subset (cell type x condition)
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J sub.genrich
#SBATCH -N 2
#SBATCH -p gpu
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
# genrich.subsets.sh -- call Genrich peaks separately for each
# cell type (GABA / GLU / OLIG) x condition (control / heroin) subset.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/genrich
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam || exit 1

# Print the comma-separated alignment list for the given sample IDs.
# Every alignment is named <sample ID>_<barcodes>_<flowcell>_<lane>_001.R1.bwa.sam,
# so each ID must resolve to exactly one file; anything else is an error.
# This replaces the hand-pasted file names, one of which pointed at a
# *.R1.rmdups.bam that does not live in this directory.
sam_list() {
    local id matches list=""
    for id in "$@"; do
        matches=( "${id}"_*.R1.bwa.sam )
        if [ "${#matches[@]}" -ne 1 ] || [ ! -e "${matches[0]}" ]; then
            echo "genrich.subsets.sh: expected exactly one ${id}_*.R1.bwa.sam" >&2
            return 1
        fi
        list+="${list:+,}${matches[0]}"
    done
    printf '%s' "$list"
}

# Run Genrich on one subset: $1 = output label, remaining arguments = sample IDs.
# -j ATAC-seq mode, -y keep unpaired alignments, -r remove PCR duplicates.
# -e names the mitochondrial genome as it appears in the NCBI RefSeq reference
# (NC_012920.1); the previous "-e chrM" matched no sequence.
run_subset() {
    local label=$1 inputs
    shift
    inputs=$(sam_list "$@") || exit 1
    Genrich -t "$inputs" -f "genrich.${label}.log" -o "genrich.${label}.narrowPeak" -j -y -r -e NC_012920.1 -v
}

run_subset gaba.control \
    GABA-276 GABA-372 GABA-405 GABA-406 GABA-430 GABA-444 \
    GABA-S00-0255 GABA-S03-0019 GABA-S05-0105 GABA-S08-0061 GABA-S11-0310 GABA-S13-0098 \
    pool-A-257-GABA pool-A-260-GABA pool-A-265-GABA pool-A-286-GABA pool-A-319-GABA \
    pool-A-332-GABA pool-A-338-GABA pool-A-365-GABA pool-A-376-GABA pool-A-387-GABA \
    pool-A-394-GABA pool-A-395-GABA pool-A-407-GABA pool-A-427-GABA

run_subset gaba.heroin \
    GABA-302 GABA-316 GABA-322 GABA-339 GABA-340 GABA-345 GABA-357 GABA-368 \
    GABA-370 GABA-377 GABA-383 GABA-408 \
    pool-B-255-GABA pool-B-310-GABA pool-B-325-GABA pool-B-333-GABA pool-B-336-GABA \
    pool-B-343-GABA pool-B-354-GABA pool-B-382-GABA pool-B-385-GABA pool-B-388-GABA \
    pool-B-389-GABA pool-B-401-GABA

run_subset glu.control \
    GLU-276 GLU-288 GLU-372 GLU-405 GLU-430 GLU-444 \
    GLU-S03-0019 GLU-S05-0105 GLU-S05-0252 GLU-S08-0061 GLU-S13-0098 \
    pool-A-257-GLU pool-A-260-GLU pool-A-265-GLU pool-A-286-GLU pool-A-319-GLU \
    pool-A-332-GLU pool-A-338-GLU pool-A-365-GLU pool-A-376-GLU pool-A-387-GLU \
    pool-A-394-GLU pool-A-395-GLU pool-A-398-GLU pool-A-407-GLU pool-A-427-GLU

run_subset glu.heroin \
    GLU-302 GLU-316 GLU-322 GLU-339 GLU-340 GLU-345 GLU-349 GLU-357 GLU-368 \
    GLU-370 GLU-377 GLU-383 GLU-408 \
    pool-B-255-GLU pool-B-310-GLU pool-B-325-GLU pool-B-333-GLU pool-B-336-GLU \
    pool-B-343-GLU pool-B-354-GLU pool-B-382-GLU pool-B-385-GLU pool-B-386-GLU \
    pool-B-389-GLU pool-B-401-GLU

run_subset olig.control \
    OLIG-276 OLIG-288 OLIG-372 OLIG-405 OLIG-430 OLIG-444 \
    OLIG-S00-0255 OLIG-S03-0019 OLIG-S05-0105 OLIG-S05-0252 OLIG-S08-0061 \
    OLIG-S11-0310 OLIG-S13-0098 \
    pool-A-257-OLIG pool-A-260-OLIG pool-A-265-OLIG pool-A-286-OLIG pool-A-332-OLIG \
    pool-A-338-OLIG pool-A-376-OLIG pool-A-394-OLIG pool-A-395-OLIG pool-A-398-OLIG \
    pool-A-427-OLIG

run_subset olig.heroin \
    OLIG-302 OLIG-316 OLIG-322 OLIG-339 OLIG-340 OLIG-345 OLIG-349 OLIG-357 \
    OLIG-368 OLIG-370 OLIG-383 OLIG-408 OLIG-413 \
    pool-B-255-OLIG pool-B-310-OLIG pool-B-325-OLIG pool-B-333-OLIG pool-B-336-OLIG \
    pool-B-343-OLIG pool-B-382-OLIG pool-B-385-OLIG pool-B-386-OLIG pool-B-389-OLIG
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/genrich.subsets.sh
https://academic.oup.com/nargab/article/3/3/lqab059/6313252#267123242 https://bioconductor.org/packages/release/bioc/html/ROTS.html
## Consensus peak matrix
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# DiffBind: consensus peak counts and Condition + Tissue differential analysis
# on the MACS2 narrowPeak calls (outlier-filtered sample sheet).
library(DiffBind)
library(tidyverse)

project.dir  <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
peaks.dir    <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/peaks"
sample.sheet <- "key.files/diffbind.macs2.outlier.sample.csv"

# Open a pdf, evaluate the plotting call, and always close the device again,
# even when the plot fails (a stray open device would swallow every later plot).
to.pdf <- function(file, plot.call) {
  pdf(file)
  on.exit(dev.off())
  invisible(plot.call)  # the argument is evaluated here, i.e. after pdf() opened
}

# The sample sheet is given relative to the project root.
setwd(project.dir)
samples <- read.csv(sample.sheet)
names(samples)
df <- dba(sampleSheet=sample.sheet)
#df.counted <- dba.count(df, summits=250)
df.counted <- dba.count(df)

## Consensus peak matrix
setwd(peaks.dir)
counts <- dba.peakset(df.counted, bRetrieve=TRUE, DataType=DBA_DATA_FRAME)
write.table(counts, "all.outliers.narrowpeak.peakset.counts.txt", sep="\t", quote=FALSE, row.names=FALSE)

### Differential - Condition + Tissue
df.counted <- dba.contrast(df.counted, design="~Tissue + Condition")
setwd(project.dir)
# DESeq2 and edgeR in a single pass. The notes used to run DESeq2 alone, then
# edgeR alone, then both, overwriting df.analysed each time; only the last
# result was ever used, so the first two runs were wasted compute.
df.analysed <- dba.analyze(df.counted, method=DBA_ALL_METHODS)
dba.show(df.analysed, bContrasts=TRUE)

# Everything below is written into the peaks directory.
setwd(peaks.dir)
counts <- dba.peakset(df.analysed, bRetrieve=TRUE, DataType=DBA_DATA_FRAME)
write.table(counts, "all.outliers.peakset.narrowpeak.differential.condition.tissue.counts.txt", sep="\t", quote=FALSE, row.names=FALSE)

to.pdf("dba.macs2.differential.venn.outliers.narrowpeak.condition.tissue.pdf",
       dba.plotVenn(df.analysed, contrast=1, method=DBA_ALL_METHODS))
to.pdf("dba.macs2.differential.heatmap.outliers.narrowpeak.condition.tissue.pdf",
       dba.plotHeatmap(df.analysed, contrast=1))
to.pdf("dba.macs2.differential.heatmap2.outliers.narrowpeak.condition.pdf",
       dba.plotHeatmap(df.analysed, ColAttributes = DBA_CONDITION, contrast=1, correlations=FALSE))
to.pdf("dba.macs2.plotMA.outliers.narrowpeak.condition.tissue.pdf",
       dba.plotMA(df.analysed))
to.pdf("dba.macs2.plotVolcano.outliers.narrowpeak.condition.tissue.pdf",
       dba.plotVolcano(df.analysed))
to.pdf("dba.macs2.plotPCA.outliers.narrowpeak.condition.tissue.pdf",
       dba.plotPCA(df.analysed, contrast = 1))
to.pdf("dba.macs2.plotBox.outliers.narrowpeak.condition.tissue.pdf",
       dba.plotBox(df.analysed))

report <- dba.report(df.analysed)
report
report.df <- as.data.frame(report)
# NOTE: the file is tab-separated despite its .csv name; the name is kept so
# existing downstream readers still find it.
write.table(report.df, "peaks.report.mac2.all.narrowpeak.outliers.condition.tissue.csv", sep="\t", quote=FALSE, row.names=FALSE)
### ROTS
# ROTS differential testing per cell type on the DiffBind consensus counts.
# Install ROTS only when it is missing (it used to be reinstalled on every run),
# then load it -- ROTS() was being called without the package ever being attached.
if (!requireNamespace("ROTS", quietly = TRUE)) {
  if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
  BiocManager::install("ROTS")
}
library(ROTS)
#ROTS(data, groups, B = 1000, K = NULL, paired = FALSE, seed = NULL, a1 = NULL, a2 = NULL, log = TRUE, progress = FALSE, verbose = TRUE)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/peaks")
data <- read.delim("all.outliers.narrowpeak.peakset.counts.txt", header=TRUE, sep="\t")
# TODO: `data[,]` is still a placeholder that selects every column. Each subset
# must be restricted to the count columns of that cell type before ROTS can run.
data.gaba <- data[,]
data.glu <- data[,]
data.olig <- data[,]
# TODO: `groups` below is still the 3-vs-3 example (three 0s, three 1s). It needs
# one entry per column of the matching data subset, coding that sample's condition.
# GABA
groups = c(rep(0,3), rep(1,3))
results = ROTS(data = data.gaba, groups = groups , B = 1000 , seed = 1234)
names(results)
summary(results, fdr = 0.05)
pdf("rots.gaba.results.volcano.pdf")
plot(results, fdr = 0.05, type = "volcano")
dev.off()
pdf("rots.gaba.results.heatmap.pdf")
plot(results, fdr = 0.05, type = "heatmap")
dev.off()
# GLU
groups = c(rep(0,3), rep(1,3))
results = ROTS(data = data.glu, groups = groups , B = 1000 , seed = 1234)
names(results)
summary(results, fdr = 0.05)
pdf("rots.glu.results.volcano.pdf")
plot(results, fdr = 0.05, type = "volcano")
dev.off()
pdf("rots.glu.results.heatmap.pdf")
plot(results, fdr = 0.05, type = "heatmap")
dev.off()
# OLIG
groups = c(rep(0,3), rep(1,3))
results = ROTS(data = data.olig, groups = groups , B = 1000 , seed = 1234)
names(results)
summary(results, fdr = 0.05)
pdf("rots.olig.results.volcano.pdf")
plot(results, fdr = 0.05, type = "volcano")
dev.off()
pdf("rots.olig.results.heatmap.pdf")
plot(results, fdr = 0.05, type = "heatmap")
dev.off()
# --> use the read-count code from the paper: https://github.com/elolab/Faux_et_al_NARGAB2021/blob/main/Differential_peak_calling%20scripts/YF_ATAC/ROTS/code/readcount.R
#!/bin/bash
# rots.diffbind.sh -- run the DiffBind/ROTS R scripts as one SLURM job.
# NOTE(review): unlike the other batch scripts in these notes, no partition
# (-p) is requested here -- confirm the default partition is the intended one.
#SBATCH -A SYB105
#SBATCH -J rots.difbind
#SBATCH -N 2
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# The R scripts are looked up relative to the project root.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
# The four scripts run one after another: all samples first, then one per cell type.
R CMD BATCH rots.diffbind.R
R CMD BATCH rots.diffbind.gaba.R
R CMD BATCH rots.diffbind.glu.R
R CMD BATCH rots.diffbind.olig.R
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/rots.diffbind.sh
# salloc -A SYB105 -p gpu -N 2 -t 2:00:00
# DiffBind differential-accessibility analysis on the MACS2 peaks listed in the
# sample sheet: overlap diagnostics, read counting, a contrast on Condition,
# analysis with both DESeq2 and edgeR, and export of the results as BED-like
# tables. Intermediate objects are saved as .RData after each slow step.
library(DiffBind)
library(DESeq2)
library(edgeR)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# Load the peak sets and BAM paths described in the sample sheet.
sheet <- dba(sampleSheet="key.files/diffbind.macs2.outlier.sample.csv")
print("data loading done")
# Diagnostic 1: number of peaks as a function of how many peak sets they overlap in.
olap.rate <- dba.overlap(sheet,mode=DBA_OLAP_RATE)
jpeg("overlapRateATAC.jpeg")
plot(olap.rate,type='b',ylab='# peaks', xlab='Overlap at least this many peaksets')
dev.off()
# Diagnostic 2: per-condition consensus peak sets and their Venn overlap.
peakset <- dba.peakset(sheet, consensus=DBA_CONDITION)
jpeg("peakOverlapATAC.jpeg")
dba.plotVenn(peakset,peakset$masks$Consensus)
dev.off()
# Count reads in peaks; note this uses `sheet`, not the consensus `peakset` above.
ATAC.count <- dba.count(sheet)
save(ATAC.count,file="ATAC.count.RData")
print("count done")
# Set up the contrast on the Condition column of the sample sheet.
ATAC.contrast <- dba.contrast(ATAC.count, categories=DBA_CONDITION)
save(ATAC.contrast,file="ATAC.contrast.RData")
print("contrast done")
# Run both DESeq2 and edgeR, serially (bParallel=FALSE).
ATAC.analysed <- dba.analyze(ATAC.contrast,bParallel=FALSE,method=c(DBA_DESEQ2,DBA_EDGER))
save(ATAC.analysed,file="ATAC.analysed_narrow.RData")
print("analyze done")
# edgeR export: sites passing the 0.05 threshold; score = -10*log10(FDR).
# Written columns: first three report columns, row name, score.
# NOTE(review): the "YF" in the output names looks inherited from the paper's
# scripts rather than describing this data set -- confirm.
report <- dba.report(ATAC.analysed, th=.05, DataType=DBA_DATA_FRAME,method=DBA_EDGER)
score <- -10*(log10(report$FDR))
write.table(cbind(report[,1:3],rownames(report),score),
"DBsites_YF_edger_narrow.bed", quote=FALSE, sep="\t",
row.names=FALSE, col.names=FALSE)
# DESeq2 export: th=1 keeps every site; here the score column is the raw FDR
# (not -10*log10 as in the edgeR file) and the fold change is added.
report <- dba.report(ATAC.analysed, th=1, DataType=DBA_DATA_FRAME,method=DBA_DESEQ2)
score <- report$FDR
FC <- report$Fold
write.table(cbind(report[,1:3],rownames(report),score,FC),
"DBsites_YF_deseq2_narrow.bed", quote=FALSE, sep="\t",
row.names=FALSE, col.names=FALSE)
print("report done")
# Shell: switch to the directory that holds the ROTS scripts and their input files.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts
# make ROTS_input.csv file with consensus peaks and bam directories
# Build the semicolon-separated ROTS input tables from the DiffBind sample
# sheet. Each output row is: consensus peak file ; BAM file ; group, where
# group is 1 for control and 2 for heroin. One table is written per cell type
# (gaba, glu, olig) plus one combined table.
project.dir <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/"
setwd(paste0(project.dir, "key.files"))
df <- read.delim("diffbind.macs2.outlier.sample.csv", header=T, sep=",")

# ROTS input rows for one tissue/condition pair.
#   samples   : sample sheet with Condition, Tissue and bamReads columns
#   tissue    : "gaba", "glu" or "olig"
#   condition : "control" or "heroin"
#   group     : value written to the third output column
# Columns are built by name, so the result does not depend on how many
# columns the sample sheet has or the order they appear in.
rots.rows <- function(samples, tissue, condition, group) {
  # which() drops rows where the comparison is NA, as subset() would.
  sel <- samples[which(samples$Condition == condition & samples$Tissue == tissue), ]
  n <- nrow(sel)
  data.frame(
    peaks = rep(paste0(project.dir, "diffbind.MACS2.consensus.", tissue, ".", condition, ".bed"), n),
    bam = paste0(rep(project.dir, n), as.character(sel$bamReads)),
    rep = rep(group, n),
    stringsAsFactors = FALSE
  )
}

# Control rows followed by heroin rows for one tissue.
rots.tissue <- function(samples, tissue) {
  rbind(rots.rows(samples, tissue, "control", 1),
        rots.rows(samples, tissue, "heroin", 2))
}

gaba <- rots.tissue(df, "gaba")
glu <- rots.tissue(df, "glu")
olig <- rots.tissue(df, "olig")
gaba.glu <- rbind(gaba, glu)
gaba.glu.olig <- rbind(gaba.glu, olig)

setwd(paste0(project.dir, "ROTS.scripts"))
# No header, no quoting: the files are plain "peaks;bam;group" lines.
write.table(gaba.glu.olig, "ROTS_input.csv", quote=F, row.names=F, col.names=F, sep=";")
write.table(gaba, "ROTS_input_gaba.csv", quote=F, row.names=F, col.names=F, sep=";")
write.table(glu, "ROTS_input_glu.csv", quote=F, row.names=F, col.names=F, sep=";")
write.table(olig, "ROTS_input_olig.csv", quote=F, row.names=F, col.names=F, sep=";")
## run on macs2 peaks (diffbind consensus)
#### make sure to install all R packages within conda environment before submitting (ROTS, GenomicRanges, rtracklayer, Rsubread, Rsamtools, ade4, made4)
# Shell: submit the ROTS job to SLURM.
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/Rscriptandes.sh
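# ** Needs to be run separately for each cell type: the input has to have the file name ROTS_input.csv, so the cell-type-specific table (ROTS_input_gaba.csv, ROTS_input_glu.csv or ROTS_input_olig.csv) must be placed under that name before each run.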
# ROTS differential calling for glu and olig. The two sections are identical
# apart from their output file names; which cell type is analysed depends on
# the `Object_holder` that is in memory when each section runs.
# NOTE(review): `Object_holder`, differentialCall() and outputGeneration() are
# not defined in this part of the file -- presumably they come from the
# paper's ROTS scripts; confirm.
# --- glu ---
# B=100 resamplings; K is set to half the number of rows of the filtered, normalised counts.
Object_holder2 <- differentialCall(Object_holder, B=100, K=floor(nrow(Object_holder$Filtered_NormCounts)/2), seed = 14,paired = FALSE, normalized = TRUE)
save(Object_holder2, file = "object_lower1_pfalse_glu.RData")
# fdr=1: presumably no FDR filtering, i.e. every tested peak is reported -- confirm.
results <-outputGeneration(Object_holder2,fdr=1)
# Default write.table separator (a single space), with a header row.
write.table(results,"myresults_lower1_pfalse_glu.bed",quote = FALSE,row.names = FALSE)
# Same object as saved above, written again under a second file name.
save(Object_holder2,file = "object_lower1_pfalse2_glu.RData")
# --- olig ---
Object_holder2 <- differentialCall(Object_holder, B=100, K=floor(nrow(Object_holder$Filtered_NormCounts)/2), seed = 14,paired = FALSE, normalized = TRUE)
save(Object_holder2, file = "object_lower1_pfalse_olig.RData")
results <-outputGeneration(Object_holder2,fdr=1)
write.table(results,"myresults_lower1_pfalse_olig.bed",quote = FALSE,row.names = FALSE)
save(Object_holder2,file = "object_lower1_pfalse2_olig.RData")
# Summarise the ROTS log fold changes per cell type and count peaks above and
# below |logfc| = 0.5 and 0. The files are space-separated with a header.
# NOTE(review): "differential" in this section is defined by the logfc cutoff
# alone; no FDR or p-value filter is applied here -- confirm this is intended.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
gaba <- read.delim("myresults_lower1_pfalse_gaba.bed", header=T, sep=" ")
glu <- read.delim("myresults_lower1_pfalse_glu.bed", header=T, sep=" ")
olig <- read.delim("myresults_lower1_pfalse_olig.bed", header=T, sep=" ")
## negative logfc = higher in heroin
summary(gaba$logfc)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#-0.991887 -0.141343 -0.010064 -0.004555 0.128011 1.108429
summary(glu$logfc)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#-0.803101 -0.137979 -0.006395 -0.001905 0.130659 1.252575
summary(olig$logfc)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
#-0.98568 -0.18167 -0.04294 -0.01897 0.13050 1.01096
# Peaks passing the |logfc| > 0.5 cutoff, split by direction.
nrow(subset(gaba, gaba$logfc > 0.5))
# 385 gaba peaks are differential (higher in control) --> 385/571=67%
nrow(subset(gaba, gaba$logfc < -0.5))
# 186 gaba peaks are differential (higher in heroin)
nrow(subset(glu, glu$logfc > 0.5))
# 601 glu peaks are differential (higher in control) --> 57%
nrow(subset(glu, glu$logfc < -0.5))
# 452 glu peaks are differential (higher in heroin)
nrow(subset(olig, olig$logfc > 0.5))
# 928 olig peaks are differential (higher in control) --> 66%
nrow(subset(olig, olig$logfc < -0.5))
# 486 olig peaks are differential (higher in heroin)
#### what is causing the bias towards more differential peaks in control?
# --> not due to mapping rates...
# --> more prominent in differential than in overall peaks
# Same split over all peaks (cutoff 0) for comparison with the differential subset.
nrow(subset(gaba, gaba$logfc > 0))
# 26100 gaba peaks are higher in control (26100/54332=48%)
nrow(subset(gaba, gaba$logfc < 0))
# 28232 gaba peaks are higher in heroin (28232/54332=52%)
nrow(subset(glu, glu$logfc < 0))
# 45615 / 88989 = 51%
nrow(subset(olig, olig$logfc < 0))
# 26874 / 46889 = 57%
# Assign each differential peak (|log2fc| > 0.5) to its closest annotated gene
# with tidygenomics::genome_join_closest, then check which of those genes are
# differentially expressed in the matching RNA-seq DEG table.
library(tidygenomics)
library(dplyr)
library(tidyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
# Headerless 4-column BED files: chr, start, end, log2fc (named below).
gaba <- read.delim("rots.gaba.differential.igv.log2gr0.5.bed", header=F, sep="\t")
# 571
glu <- read.delim("rots.glu.differential.igv.log2gr0.5.bed", header=F, sep="\t")
# 1053
olig <- read.delim("rots.olig.differential.igv.log2gr0.5.bed", header=F, sep="\t")
# 1414
colnames(gaba) <- c("chr", "start", "end", "log2fc")
colnames(glu) <- c("chr", "start", "end", "log2fc")
colnames(olig) <- c("chr", "start", "end", "log2fc")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Genome")
# NOTE(review): presumably a GTF reduced to gene records only -- confirm.
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header=F, sep="\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
gaba.int <- read.delim("GABA.DEG.txt", header=T, sep="\t")
glu.int <- read.delim("GLU.DEG.txt", header=T, sep="\t")
olig.int <- read.delim("OLIG.DEG.txt", header=T, sep="\t")
# Keep genes whose third column is not "NO".
# NOTE(review): column 3 looks like the RNA-seq regulation call (UP/DOWN/NO) -- confirm.
gaba.deg <- subset(gaba.int, gaba.int[,3] != "NO")
# 396
glu.deg <- subset(glu.int, glu.int[,3] != "NO")
# 212
olig.deg <- subset(olig.int, olig.int[,3] != "NO")
# 164
colnames(gene) <- c("chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info")
# Pull the gene identifier out of the attribute column: split on the first
# spaces into key ("gene") and value ("id"), then keep the text before the
# first ";" as `gid`. Extra pieces are dropped by separate().
gene.df <- separate(gene, "info", c("gene", "id"), sep=" ")
gene.id <- separate(gene.df, "id", c("gid"), sep=";")
# Left join: every peak is kept and paired with its closest gene; `distance`
# holds the distance reported by genome_join_closest.
gaba.gene <- genome_join_closest(gaba, gene.id, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
glu.gene <- genome_join_closest(glu, gene.id, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
olig.gene <- genome_join_closest(olig, gene.id, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
summary(gaba.gene$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0 0 0 25644 9521 1206653
summary(glu.gene$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0 0 0 24532 13955 753695
summary(olig.gene$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0 0 0 9264 0 617791
hist(gaba.gene$distance)
hist(glu.gene$distance)
hist(olig.gene$distance)
# --- GABA: peaks whose closest gene is a DEG ---
gaba.gene.deg <- subset(gaba.gene, gaba.gene$gid %in% gaba.deg$Gene_Symbol)
nrow(gaba.gene.deg)
# 9
length(unique(gaba.gene.deg$gid))
# 8
# Columns 14, 4, 15 of the joined table are gid, log2fc and distance (see header below).
gaba.gene.deg[,c(14,4,15)]
# gid log2fc distance
# 33 VGF 0.6992547 0
# 50 TRIB1 0.5369618 0
# 132 RCN1 0.5856225 0
# 153 MFAP3L 0.5333128 0
# 230 VGF 0.5074324 0
# 297 LINC01619 0.5886007 0
# 309 PCSK1 0.5075241 0
# 442 CDH19 0.6747794 28573 <-- DOWN RNAseq log2fc=-1.3992
# 576 C20orf203 -0.6298723 10210 <-- UP RNAseq log2fc=0.6314
summary(gaba.gene.deg$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0 0 0 4309 0 28573
hist(gaba.gene.deg$distance)
# --- GLU ---
glu.gene.deg <- subset(glu.gene, glu.gene$gid %in% glu.deg$Gene_Symbol)
nrow(glu.gene.deg)
# 5
length(unique(glu.gene.deg$gid))
# 5
glu.gene.deg[,c(14,4,15)]
# gid log2fc distance
# 269 STYXL2 0.5047374 0
# 500 NPTX2 0.5270811 2889 <-- DOWN RNAseq log2fc=-0.7354
# 733 ROS1 0.5004282 20702 <-- DOWN RNAseq log2fc=-0.6823
# 769 MS4A8 -0.5009932 0
# 812 LOC105373750 0.5223780 0
summary(glu.gene.deg$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0 0 0 4718 2889 20702
hist(glu.gene.deg$distance)
# --- OLIG ---
olig.gene.deg <- subset(olig.gene, olig.gene$gid %in% olig.deg$Gene_Symbol)
nrow(olig.gene.deg)
# 11
length(unique(olig.gene.deg$gid))
# 11
olig.gene.deg[,c(14,4,15)]
# gid log2fc distance
# 223 PER1 0.6428641 0
# 302 LOC105373759 0.5502844 0
# 317 ADAMTS1 0.5011651 0
# 370 CORO1C 0.5792383 0
# 831 BCL6 0.6202792 0
# 900 TMEFF1 0.5859063 0
# 1163 PXDC1 -0.5309308 0
# 1244 ATF3 -0.5278876 0
# 1260 CCDC8 -0.5387984 1324 <-- UP RNAseq log2fc=1.0974
# 1281 LOC105371348 -0.5321454 0
# 1469 KCNN3 -0.9581082 0
summary(olig.gene.deg$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0 0.0 0.0 120.4 0.0 1324.0
hist(olig.gene.deg$distance)
#### Looking in the wrong directions?
# Shell: for each cell type, find the closest gene to every differential peak
# with bedtools closest. Output goes to <celltype>.genes.rots.bedtools.bed in
# the project root.
# The path contains spaces and parentheses, so it must be quoted.
cd "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC"
# bedtools closest needs both inputs sorted by chromosome, then start
# (column 4 in a GTF, column 2 in a BED).
sort -k 1,1 -k 4,4n Genome/GCF_000001405.39_GRCh38.p13_genomic.gene.gtf > Genome/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.gtf
for celltype in gaba glu olig; do
    peaks="Consensus.Peaks/ROTS.differential/rots.${celltype}.differential.igv.log2gr0.5"
    # Sort every cell type's peaks; all three .sorted.bed files are used below.
    sort -k 1,1 -k 2,2n "${peaks}.bed" > "${peaks}.sorted.bed"
    # -D b: signed distance reported with respect to the gene (B) and its strand.
    # -id : ignore genes that are downstream of the peak (see the bedtools closest docs).
    bedtools2/bin/closestBed -D b -id -b Genome/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.gtf -a "${peaks}.sorted.bed" > "${celltype}.genes.rots.bedtools.bed"
done
# Join the bedtools closest-gene output (peak -> closest gene) with the RNA-seq
# DEG tables per cell type, and list DEGs that have a differential peak nearby.
library(dplyr)
library(tidyr)
# The bedtools outputs are written to the project root, so read them from there.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.genes <- read.delim("gaba.genes.rots.bedtools.bed", header=F, sep="\t")
glu.genes <- read.delim("glu.genes.rots.bedtools.bed", header=F, sep="\t")
olig.genes <- read.delim("olig.genes.rots.bedtools.bed", header=F, sep="\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
gaba.int <- read.delim("GABA.DEG.txt", header=T, sep="\t")
glu.int <- read.delim("GLU.DEG.txt", header=T, sep="\t")
olig.int <- read.delim("OLIG.DEG.txt", header=T, sep="\t")
# Keep genes whose third column (RNA.seq.regulation in the joined output below) is not "NO".
gaba.deg <- subset(gaba.int, gaba.int[,3] != "NO")
# 396
glu.deg <- subset(glu.int, glu.int[,3] != "NO")
# 212
olig.deg <- subset(olig.int, olig.int[,3] != "NO")
# 164
# bedtools output layout: 4 peak columns, 9 GTF columns, signed distance.
colnames(gaba.genes) <- c("chr.peak", "start.peak", "end.peak", "log2fc", "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info", "distance")
# Extract the gene symbol from the GTF attribute column: drop the key, keep
# the text before the first ";".
gaba.genes.df <- separate(gaba.genes, "info", c("gene", "id"), sep=" ")
gaba.genes.id <- separate(gaba.genes.df, "id", c("Gene_Symbol"), sep=";")
gaba.genes.deg <- inner_join(gaba.genes.id, gaba.deg, by="Gene_Symbol")
# 10
# Columns 14, 15, 18, 4: Gene_Symbol, distance, RNA-seq log2FC, peak log2fc.
gaba.genes.deg[,c(14,15,18,4)]
# Gene_Symbol distance log2FC log2fc
# 1 CD84 -27574 -0.9649 -0.5889022
# 2 MFAP3L 0 -0.6426 0.5333128
# 3 PCSK1 0 -0.6291 0.5075241
# 4 DSP -72454 -0.9849 0.5283299
# 5 VGF 0 -0.8737 0.5074324
# 6 VGF 0 -0.8737 0.6992547
# 7 TRIB1 0 -0.8611 0.5369618
# 8 RCN1 0 -0.5863 0.5856225
# 9 LINC01619 0 0.8301 0.5886007
# 10 CDH19 -28575 -1.3992 0.6747794
colnames(glu.genes) <- c("chr.peak", "start.peak", "end.peak", "log2fc", "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info", "distance")
glu.genes.df <- separate(glu.genes, "info", c("gene", "id"), sep=" ")
glu.genes.id <- separate(glu.genes.df, "id", c("Gene_Symbol"), sep=";")
glu.genes.deg <- inner_join(glu.genes.id, glu.deg, by="Gene_Symbol")
# 6
glu.genes.deg[,c(14,15,18,4)]
# Gene_Symbol distance log2FC log2fc
# 1 STYXL2 0 -0.9007 0.5047374
# 2 LOC105373750 0 -1.0805 0.5223780
# 3 ZIC1 -28858 1.8624 -0.5176765
# 4 ROS1 -20704 -0.6823 0.5004282
# 5 NPTX2 -2890 -0.7354 0.5270811
# 6 MS4A8 0 1.1606 -0.5009932
colnames(olig.genes) <- c("chr.peak", "start.peak", "end.peak", "log2fc", "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info", "distance")
olig.genes.df <- separate(olig.genes, "info", c("gene", "id"), sep=" ")
olig.genes.id <- separate(olig.genes.df, "id", c("Gene_Symbol"), sep=";")
olig.genes.deg <- inner_join(olig.genes.id, olig.deg, by="Gene_Symbol")
# 12
olig.genes.deg[,c(14,15,18,4)]
# Gene_Symbol distance log2FC log2fc
# 1 KCNN3 0 0.6513 -0.9581082
# 2 ATF3 0 1.2089 -0.5278876
# 3 LOC105373759 0 0.5319 0.5502844
# 4 BCL6 0 1.0079 0.6202792
# 5 PXDC1 0 0.7517 -0.5309308
# 6 MARCKS 16046 0.8853 -0.5502230
# 7 TMEFF1 0 -0.6406 0.5859063
# 8 CORO1C 0 0.6273 0.5792383
# 9 LOC105371348 0 1.1212 -0.5321454
# 10 PER1 0 1.1215 0.6428641
# 11 CCDC8 1325 1.0974 -0.5387984
# 12 ADAMTS1 0 -0.5574 0.5011651 <-- literature
# Full joined record for ADAMTS1 (flagged above).
adamts1 <- subset(olig.genes.deg, olig.genes.deg$Gene_Symbol == "ADAMTS1")
adamts1
# chr.peak start.peak end.peak log2fc chr.x source annotation
# 12 NC_000021.9 26844572 26845987 0.5011651 NC_000021.9 BestRefSeq gene
# start.x end dot strand.x dot2 gene Gene_Symbol distance Entrez_ID
# 12 26835755 26845409 . - . gene_id ADAMTS1 0 9510
# RNA.seq.regulation log2FC Padj chr.y start.y stop
# 12 DOWN -0.5574 0.0441 NC_000021.9 26,835,755 26,845,409
# strand.y TSS_start end_CDS TSS_2kbup CDS_2kbdown TSS_10kb_up
# 12 - 26,845,409 26,835,755 26,847,409 26,833,755 26,855,409
# CDS_10kb_down
# 12 26,825,755
#### look at the closest DEG to each differential peak (regardless of other genes in between)
# Shell, run from the project root (paths are relative). A = differential
# peaks, B = DEG-only GTF; -D a reports the signed distance with respect to A,
# -id ignores B features downstream of A (see the bedtools closest docs).
# NOTE(review): the RNAseq/*.deg.sorted.gtf files are created outside this
# section -- confirm they are sorted by chromosome and start.
bedtools2/bin/closestBed -D a -id -b RNAseq/gaba.deg.sorted.gtf -a Consensus.Peaks/ROTS.differential/rots.gaba.differential.igv.log2gr0.5.sorted.bed > gaba.deg.rots.bedtools.bed
bedtools2/bin/closestBed -D a -id -b RNAseq/glu.deg.sorted.gtf -a Consensus.Peaks/ROTS.differential/rots.glu.differential.igv.log2gr0.5.sorted.bed > glu.deg.rots.bedtools.bed
bedtools2/bin/closestBed -D a -id -b RNAseq/olig.deg.sorted.gtf -a Consensus.Peaks/ROTS.differential/rots.olig.differential.igv.log2gr0.5.sorted.bed > olig.deg.rots.bedtools.bed
# For each differential peak, list the closest DEG when it lies within 100 kb.
# Input: bedtools output with the peak in columns V1-V4 and the DEG record
# after it; V14 is the signed distance.
# NOTE(review): assumes the DEG GTF has the standard 9 columns, making V11 the
# strand and V13 the attribute (gene id) column -- confirm.
library(dplyr)
library(tidyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
# Note: these names now hold bedtools output, replacing any DEG tables that
# were stored under gaba.deg / glu.deg / olig.deg earlier in the session.
gaba.deg <- read.delim("gaba.deg.rots.bedtools.bed", header=F, sep="\t")
glu.deg <- read.delim("glu.deg.rots.bedtools.bed", header=F, sep="\t")
olig.deg <- read.delim("olig.deg.rots.bedtools.bed", header=F, sep="\t")
# Keep pairs closer than 100 kb; printed columns are gene id, distance, strand, peak log2fc.
gaba.deg.hun <- subset(gaba.deg, abs(gaba.deg$V14) < 100000)
gaba.deg.hun[,c(13,14,11,4)]
# 43 gene_idCD84 -27574 - -0.5889022
# 49 gene_idIRF6 -79998 - -0.5174799
# 103 gene_idLOC105377082 -26450 + -0.5632512
# 104 gene_idLOC101929054 -74655 + -0.5867222
# 157 gene_idMFAP3L 0 - 0.5333128
# 182 gene_idMIR583HG -43217 + 0.5386203
# 183 gene_idPCSK1 0 - 0.5075241
# 216 gene_idTENT5A -81342 - 0.5493052
# 257 gene_idVGF 0 - 0.5074324
# 258 gene_idVGF 0 - 0.6992547
# 259 gene_idPLOD3 -55601 - 0.5601162
# 260 gene_idPLOD3 -60807 - 0.8376871
# 287 gene_idTRIB1 0 + 0.5369618
# 289 gene_idLOC105375767 -82070 + 0.5222219
# 326 gene_idLOC105378572 -39599 + 0.5285198
# 331 gene_idRCN1 0 + 0.5856225
# 367 gene_idLOC105369843 -73556 - 0.5743998
# 375 gene_idLINC01619 0 - 0.5886007
# 474 gene_idCDH19 -28575 - 0.6747794
glu.deg.hun <- subset(glu.deg, abs(glu.deg$V14) < 100000)
glu.deg.hun[,c(13,14,11,4)]
# 11 gene_idTMEM200B -93317 - -0.5332717
# 47 gene_idSTYXL2 0 + 0.5047374
# 79 gene_idMIR3125 -28958 + 0.6581283
# 80 gene_idMIR3125 -30114 + 0.5564509
# 149 gene_idLOC105373750 0 - 0.5223780
# 388 gene_idROS1 -20704 - 0.5004282
# 410 gene_idLINC00602 -56393 + 0.5410368
# 411 gene_idLINC00602 -66888 + 0.6396936
# 494 gene_idLOC105375721 -83623 - 0.5164253
# 503 gene_idMIR30B -85676 - -0.5867338
# 629 gene_idMS4A8 0 + -0.5009932
# 632 gene_idNRXN2-AS1 -69997 + 0.5031020
# 779 gene_idLOC646548 -44574 + -0.6391800
# 846 gene_idFHOD1 -64105 - 0.5206741
# 898 gene_idHRH4 -50384 + 0.5136931
# 899 gene_idHRH4 -56248 + 0.5302194
# 1038 gene_idPNMA6F -59680 - -0.5036524
olig.deg.hun <- subset(olig.deg, abs(olig.deg$V14) < 100000)
olig.deg.hun[,c(13,14,11,4)]
# 66 gene_idKCNN3 0 - -0.9581082
# 107 gene_idATF3 0 + -0.5278876
# 183 gene_idLOC284950 -61659 + 0.6904436
# 213 gene_idLOC105373759 0 - 0.5502844
# 214 gene_idLOC105373759 -21808 - 0.5551462
# 325 gene_idBCL6 0 - 0.6202792
# 404 gene_idHMGB2 -35237 - 0.6357713
# 407 gene_idLINC02362 -91988 - 0.5314153
# 408 gene_idLINC02362 -94702 - 0.5241995
# 487 gene_idPXDC1 0 - -0.5309308
# 497 gene_idLOC105375045 -15971 + 0.5323652
# 532 gene_idMARCKS -16046 + -0.5502230
# 595 gene_idHSPB1 -75096 + 0.5198387
# 670 gene_idLOC105375674 -77734 - 0.5480409
# 720 gene_idTMEFF1 0 + 0.5859063
# 820 gene_idYPEL4 -16844 - 0.6668689
# 821 gene_idYPEL4 -61678 - 0.5966353
# 919 gene_idCORO1C 0 - 0.5792383
# 920 gene_idCORO1C -66241 - -0.5928662
# 929 gene_idUBC -2372 - 0.5895206
# 998 gene_idIRF2BPL -11266 - 0.6627939
# 1028 gene_idRASL12 -84078 - -0.5622389
# 1073 gene_idZNF843 -15482 - 0.5845965
# 1083 gene_idADGRG3 -45474 + 0.5199222
# 1090 gene_idLOC105371348 0 + -0.5321454
# 1102 gene_idPER1 0 - 0.6428641
# 1166 gene_idRAB31 -50767 + 0.5046846
# 1167 gene_idRAB31 -98475 + -0.5212285
# 1200 gene_idGADD45B -71183 + 0.5040298
# 1240 gene_idERF -12910 - 0.8514077
# 1317 gene_idADAMTS1 0 - 0.5011651
# 1318 gene_idLOC105372785 -79620 + 0.6012385
## what about the closest differential peak to each deg
# Shell, run from the project root. Same search as before with the roles
# swapped: A = DEG-only GTF, B = differential peaks, so there is one output
# row per DEG and the distance (-D a) is reported with respect to the DEG.
bedtools2/bin/closestBed -D a -id -a RNAseq/gaba.deg.sorted.gtf -b Consensus.Peaks/ROTS.differential/rots.gaba.differential.igv.log2gr0.5.sorted.bed > gaba.closestpeaktodeg.bedtools.bed
bedtools2/bin/closestBed -D a -id -a RNAseq/glu.deg.sorted.gtf -b Consensus.Peaks/ROTS.differential/rots.glu.differential.igv.log2gr0.5.sorted.bed > glu.closestpeaktodeg.bedtools.bed
bedtools2/bin/closestBed -D a -id -a RNAseq/olig.deg.sorted.gtf -b Consensus.Peaks/ROTS.differential/rots.olig.differential.igv.log2gr0.5.sorted.bed > olig.closestpeaktodeg.bedtools.bed
# For each DEG, list the closest differential peak when it lies within 100 kb.
# Input: bedtools output with the DEG GTF record first and the 4-column peak
# after it; V14 is the signed distance.
# NOTE(review): assumes the DEG GTF has 9 columns, making V9 the attribute
# (gene id) column -- confirm.
library(dplyr)
library(tidyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.deg <- read.delim("gaba.closestpeaktodeg.bedtools.bed", header=F, sep="\t")
glu.deg <- read.delim("glu.closestpeaktodeg.bedtools.bed", header=F, sep="\t")
olig.deg <- read.delim("olig.closestpeaktodeg.bedtools.bed", header=F, sep="\t")
# Only GABA is inspected here; glu and olig are loaded but not yet filtered.
gaba.deg.hun <- subset(gaba.deg, abs(gaba.deg$V14) < 100000)
gaba.deg.hun[,c(9,14)]
# Recorded output (gene id, distance), commented out so the script still parses:
# 5 gene_idGPR3 -84730
# 18 gene_idCD84 -27574
# 23 gene_idIRF6 -79998
# 99 gene_idMFAP3L 0
# 111 gene_idPCSK1 0
# 126 gene_idDSP -72454
# 135 gene_idTENT5A -81342
# 158 gene_idVGF 0
# 159 gene_idVGF 0
# 160 gene_idPLOD3 -55601
# 182 gene_idTRIB1 0
# 225 gene_idRCN1 0
# 251 gene_idLOC105369843 -73556
# 253 gene_idLINC01619 0
# 327 gene_idCDH19 -28575
# 346 gene_idFOSB -76170
# 386 gene_idLOC105373181 -15388
# 392 gene_idPLP1 -88925
# --> characterize the peaks relative to genes: intron, exon, promoter (2kb, 5kb, 10kb), distal
# Classify every differential peak by its position relative to the closest
# gene and compare the class distribution between peaks at DEGs and non-DEGs.
#   class         : "genic" when distance == 0, otherwise upstream/downstream
#                   bins at 2 kb and 10 kb based on the signed distance
#   class.quarter : genic peaks are split further into quarters of the gene
#                   body, measured from the strand-aware gene start
#   proportion    : class count divided by the number of DEG (or nonDEG) peaks
# The denominators are computed from the data rather than typed in, so they
# stay correct if the input files change.
library(dplyr)
library(tidyr)
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.genes <- read.delim("gaba.genes.rots.bedtools.bed", header=F, sep="\t")
glu.genes <- read.delim("glu.genes.rots.bedtools.bed", header=F, sep="\t")
olig.genes <- read.delim("olig.genes.rots.bedtools.bed", header=F, sep="\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
gaba.int <- read.delim("GABA.DEG.txt", header=T, sep="\t")
glu.int <- read.delim("GLU.DEG.txt", header=T, sep="\t")
olig.int <- read.delim("OLIG.DEG.txt", header=T, sep="\t")
gaba.deg <- subset(gaba.int, gaba.int[,3] != "NO")
glu.deg <- subset(glu.int, glu.int[,3] != "NO")
olig.deg <- subset(olig.int, olig.int[,3] != "NO")
# --- GABA ---
colnames(gaba.genes) <- c("chr.peak", "start.peak", "end.peak", "log2fc", "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info", "distance")
gaba.genes.df <- separate(gaba.genes, "info", c("gene", "id"), sep=" ")
gaba.genes.id <- separate(gaba.genes.df, "id", c("Gene_Symbol"), sep=";")
gaba.genes.class <- gaba.genes.id %>% mutate(class = ifelse(distance > 10000, "downstream_distal_10kb", ifelse(distance > 2000, "downstream_distal_2kb", ifelse(distance > 0, "downstream_promoter", ifelse(distance == 0, "genic", ifelse(distance < -10000, "upstream_distal_10kb", ifelse(distance < -2000, "upstream_distal_2kb", ifelse(distance < 0, "upstream_promoter", "NA")))))))) %>% mutate(DEG = ifelse(Gene_Symbol %in% gaba.deg$Gene_Symbol, "DEG", "nonDEG"))
# tss.distance for genic peaks: peak start minus gene start on "+", gene end
# minus peak end on "-"; non-genic peaks keep the bedtools distance.
gaba.genes.class.tss <- gaba.genes.class %>% mutate(gene.length = abs(end-start), tss.distance = ifelse(class == "genic", ifelse(strand == "+", as.numeric(start.peak - start), ifelse(strand == "-", as.numeric(end - end.peak), NA)), distance)) %>% mutate(class.quarter = ifelse(class == "genic", ifelse(tss.distance/gene.length <= 0.25, "genic.first", ifelse(tss.distance/gene.length <= 0.50, "genic.second", ifelse(tss.distance/gene.length <= 0.75, "genic.third", ifelse(tss.distance/gene.length <= 1, "genic.fourth", "genic.unknown")))), class))
gaba.genes.class.count <- gaba.genes.class.tss %>% group_by(class.quarter) %>% mutate(class.count = n())
gaba.genes.class.deg <- gaba.genes.class.tss %>% group_by(class.quarter, DEG) %>% mutate(DEG.class.count = n())
# Columns 20, 21 = class.quarter and the count; column 17 = DEG.
gaba.genes.class.count.uniq <- unique(gaba.genes.class.count[,c(20,21)])
# class.quarter class.count
# <chr> <int>
# 1 upstream_promoter 4
# 2 genic.third 94
# 3 genic.fourth 94
# 4 upstream_distal_2kb 25
# 5 genic.first 128
# 6 genic.second 81
# 7 upstream_distal_10kb 174
gaba.genes.class.deg.uniq <- unique(gaba.genes.class.deg[,c(17,20,21)])
# DEG class.quarter DEG.class.count
# <chr> <chr> <int>
# 1 nonDEG upstream_promoter 4
# 2 nonDEG genic.third 91
# 3 nonDEG genic.fourth 92
# 4 nonDEG upstream_distal_2kb 25
# 5 nonDEG genic.first 126
# 6 nonDEG genic.second 81
# 7 nonDEG upstream_distal_10kb 171
# 8 DEG upstream_distal_10kb 3
# 9 DEG genic.first 2
# 10 DEG genic.third 3
# 11 DEG genic.fourth 2
# Denominators: number of nonDEG / DEG peaks (590 / 10 for the recorded run).
gaba.genes.class.deg.prop <- gaba.genes.class.deg.uniq %>% mutate(total = ifelse(DEG == "nonDEG", sum(gaba.genes.class.tss$DEG == "nonDEG"), sum(gaba.genes.class.tss$DEG == "DEG")))
gaba.genes.class.deg.prop$proportion <- gaba.genes.class.deg.prop$DEG.class.count / gaba.genes.class.deg.prop$total
gaba.genes.class.deg.prop$celltype <- "GABA"
# --- GLU ---
colnames(glu.genes) <- c("chr.peak", "start.peak", "end.peak", "log2fc", "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info", "distance")
glu.genes.df <- separate(glu.genes, "info", c("gene", "id"), sep=" ")
glu.genes.id <- separate(glu.genes.df, "id", c("Gene_Symbol"), sep=";")
glu.genes.class <- glu.genes.id %>% mutate(class = ifelse(distance > 10000, "downstream_distal_10kb", ifelse(distance > 2000, "downstream_distal_2kb", ifelse(distance > 0, "downstream_promoter", ifelse(distance == 0, "genic", ifelse(distance < -10000, "upstream_distal_10kb", ifelse(distance < -2000, "upstream_distal_2kb", ifelse(distance < 0, "upstream_promoter", "NA")))))))) %>% mutate(DEG = ifelse(Gene_Symbol %in% glu.deg$Gene_Symbol, "DEG", "nonDEG"))
glu.genes.class.tss <- glu.genes.class %>% mutate(gene.length = abs(end-start), tss.distance = ifelse(class == "genic", ifelse(strand == "+", as.numeric(start.peak - start), ifelse(strand == "-", as.numeric(end - end.peak), NA)), distance)) %>% mutate(class.quarter = ifelse(class == "genic", ifelse(tss.distance/gene.length <= 0.25, "genic.first", ifelse(tss.distance/gene.length <= 0.50, "genic.second", ifelse(tss.distance/gene.length <= 0.75, "genic.third", ifelse(tss.distance/gene.length <= 1, "genic.fourth", "genic.unknown")))), class))
glu.genes.class.count <- glu.genes.class.tss %>% group_by(class.quarter) %>% mutate(class.count = n())
glu.genes.class.deg <- glu.genes.class.tss %>% group_by(class.quarter, DEG) %>% mutate(DEG.class.count = n())
glu.genes.class.count.uniq <- unique(glu.genes.class.count[,c(20,21)])
# class.quarter class.count
# <chr> <int>
# 1 genic.fourth 182
# 2 genic.first 179
# 3 upstream_distal_10kb 359
# 4 genic.second 168
# 5 genic.third 147
# 6 upstream_distal_2kb 59
# 7 upstream_promoter 19
glu.genes.class.deg.uniq <- unique(glu.genes.class.deg[,c(17,20,21)])
# DEG class.quarter DEG.class.count
# <chr> <chr> <int>
# 1 nonDEG genic.fourth 180
# 2 nonDEG genic.first 178
# 3 nonDEG upstream_distal_10kb 357
# 4 nonDEG genic.second 168
# 5 nonDEG genic.third 147
# 6 nonDEG upstream_distal_2kb 58
# 7 DEG genic.fourth 2
# 8 nonDEG upstream_promoter 19
# 9 DEG upstream_distal_10kb 2
# 10 DEG upstream_distal_2kb 1
# 11 DEG genic.first 1
# Denominators: 1107 nonDEG / 6 DEG peaks for the recorded run.
glu.genes.class.deg.prop <- glu.genes.class.deg.uniq %>% mutate(total = ifelse(DEG == "nonDEG", sum(glu.genes.class.tss$DEG == "nonDEG"), sum(glu.genes.class.tss$DEG == "DEG")))
glu.genes.class.deg.prop$proportion <- glu.genes.class.deg.prop$DEG.class.count / glu.genes.class.deg.prop$total
glu.genes.class.deg.prop$celltype <- "GLU"
# --- OLIG ---
colnames(olig.genes) <- c("chr.peak", "start.peak", "end.peak", "log2fc", "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info", "distance")
olig.genes.df <- separate(olig.genes, "info", c("gene", "id"), sep=" ")
olig.genes.id <- separate(olig.genes.df, "id", c("Gene_Symbol"), sep=";")
olig.genes.class <- olig.genes.id %>% mutate(class = ifelse(distance > 10000, "downstream_distal_10kb", ifelse(distance > 2000, "downstream_distal_2kb", ifelse(distance > 0, "downstream_promoter", ifelse(distance == 0, "genic", ifelse(distance < -10000, "upstream_distal_10kb", ifelse(distance < -2000, "upstream_distal_2kb", ifelse(distance < 0, "upstream_promoter", "NA")))))))) %>% mutate(DEG = ifelse(Gene_Symbol %in% olig.deg$Gene_Symbol, "DEG", "nonDEG"))
olig.genes.class.tss <- olig.genes.class %>% mutate(gene.length = abs(end-start), tss.distance = ifelse(class == "genic", ifelse(strand == "+", as.numeric(start.peak - start), ifelse(strand == "-", as.numeric(end - end.peak), NA)), distance)) %>% mutate(class.quarter = ifelse(class == "genic", ifelse(tss.distance/gene.length <= 0.25, "genic.first", ifelse(tss.distance/gene.length <= 0.50, "genic.second", ifelse(tss.distance/gene.length <= 0.75, "genic.third", ifelse(tss.distance/gene.length <= 1, "genic.fourth", "genic.unknown")))), class))
olig.genes.class.count <- olig.genes.class.tss %>% group_by(class.quarter) %>% mutate(class.count = n())
olig.genes.class.deg <- olig.genes.class.tss %>% group_by(class.quarter, DEG) %>% mutate(DEG.class.count = n())
olig.genes.class.count.uniq <- unique(olig.genes.class.count[,c(20,21)])
# class.quarter class.count
# <chr> <int>
# 1 upstream_promoter 29
# 2 genic.second 200
# 3 genic.first 745
# 4 genic.third 182
# 5 upstream_distal_10kb 274
# 6 genic.fourth 196
# 7 upstream_distal_2kb 61
olig.genes.class.deg.uniq <- unique(olig.genes.class.deg[,c(17,20,21)])
# DEG class.quarter DEG.class.count
# <chr> <chr> <int>
# 1 nonDEG upstream_promoter 29
# 2 nonDEG genic.second 200
# 3 nonDEG genic.first 738
# 4 nonDEG genic.third 180
# 5 nonDEG upstream_distal_10kb 274
# 6 nonDEG genic.fourth 195
# 7 nonDEG upstream_distal_2kb 59
# 8 DEG genic.third 2
# 9 DEG genic.first 7
# 10 DEG upstream_distal_2kb 2
# 11 DEG genic.fourth 1
# Denominators: 1675 nonDEG / 12 DEG peaks for the recorded run.
olig.genes.class.deg.prop <- olig.genes.class.deg.uniq %>% mutate(total = ifelse(DEG == "nonDEG", sum(olig.genes.class.tss$DEG == "nonDEG"), sum(olig.genes.class.tss$DEG == "DEG")))
olig.genes.class.deg.prop$proportion <- olig.genes.class.deg.prop$DEG.class.count / olig.genes.class.deg.prop$total
olig.genes.class.deg.prop$celltype <- "OLIG"
# Stacked bars of class proportions, DEG vs nonDEG, one facet per cell type.
library(ggplot2)
genes.class.deg.prop <- rbind(gaba.genes.class.deg.prop, glu.genes.class.deg.prop, olig.genes.class.deg.prop)
ggplot(genes.class.deg.prop, aes(x=DEG, y=proportion, fill=class.quarter)) + geom_bar(stat="identity") + theme_classic() + facet_grid(. ~ celltype)
## add directionality of differential peak (present in control or heroin?)

# Tag each peak by the sign of its log2fc: positive -> "control.peak",
# negative -> "heroin.peak", exactly zero -> "unknown.peak".
label_peak_direction <- function(genes.class.tss) {
  genes.class.tss %>% mutate(peak.direction = ifelse(log2fc > 0, "control.peak", ifelse(log2fc < 0, "heroin.peak", "unknown.peak")))
}

# Add to every row the number of peaks that share its
# class.quarter x DEG x peak.direction combination.
count_peak_direction <- function(genes.class.tss.direction) {
  genes.class.tss.direction %>% group_by(class.quarter, DEG, peak.direction) %>% mutate(DEG.class.direction.count = n())
}

# Convert the per-class counts into proportions of their DEG x peak.direction total.
# The denominator is summed from the counts themselves instead of being typed in
# by hand, so it cannot go stale when the input changes, and "unknown.peak" rows
# get their own total instead of silently inheriting the "DEG heroin" one.
# The original class.quarter x DEG x peak.direction grouping is restored on return.
direction_proportions <- function(direction.uniq, celltype.label) {
  prop <- direction.uniq %>%
    ungroup() %>%
    group_by(DEG, peak.direction) %>%
    mutate(total = sum(DEG.class.direction.count)) %>%
    group_by(class.quarter, DEG, peak.direction)
  prop$proportion <- prop$DEG.class.direction.count / prop$total
  prop$celltype <- celltype.label
  prop
}

olig.genes.class.tss.direction <- label_peak_direction(olig.genes.class.tss)
olig.genes.class.direction <- count_peak_direction(olig.genes.class.tss.direction)
# columns 17, 20:22 = DEG, class.quarter, peak.direction, DEG.class.direction.count
olig.genes.class.direction.uniq <- unique(olig.genes.class.direction[,c(17,20:22)])
# DEG class.quarter peak.direction DEG.class.direction.count
# <chr> <chr> <chr> <int>
# 1 nonDEG upstream_promoter control.peak 22
# 2 nonDEG genic.second control.peak 110
# 3 nonDEG genic.first control.peak 648
# 4 nonDEG genic.first heroin.peak 90
# 5 nonDEG genic.third heroin.peak 70
# 6 nonDEG upstream_distal_10kb control.peak 129
# 7 nonDEG genic.second heroin.peak 90
# 8 nonDEG genic.fourth control.peak 99
# 9 nonDEG upstream_distal_10kb heroin.peak 145
# 10 nonDEG upstream_distal_2kb heroin.peak 28
# 11 nonDEG genic.fourth heroin.peak 96
# 12 nonDEG upstream_distal_2kb control.peak 31
# 13 nonDEG genic.third control.peak 110
# 14 DEG genic.third heroin.peak 1
# 15 DEG genic.first heroin.peak 3
# 16 nonDEG upstream_promoter heroin.peak 7
# 17 DEG genic.first control.peak 4
# 18 DEG upstream_distal_2kb control.peak 2
# 19 DEG genic.fourth control.peak 1
# 20 DEG genic.third control.peak 1
# For the table above the summed totals are 1149 (nonDEG control), 526 (nonDEG heroin),
# 8 (DEG control) and 4 (DEG heroin) -- the values that used to be hard-coded here.
olig.genes.class.direction.prop <- direction_proportions(olig.genes.class.direction.uniq, "OLIG")

# GABA (totals were previously hard-coded as 397 / 193 / 9 / 1)
gaba.genes.class.tss.direction <- label_peak_direction(gaba.genes.class.tss)
gaba.genes.class.direction <- count_peak_direction(gaba.genes.class.tss.direction)
gaba.genes.class.direction.uniq <- unique(gaba.genes.class.direction[,c(17,20:22)])
gaba.genes.class.direction.prop <- direction_proportions(gaba.genes.class.direction.uniq, "GABA")

# GLU (totals were previously hard-coded as 628 / 479 / 4 / 2)
glu.genes.class.tss.direction <- label_peak_direction(glu.genes.class.tss)
glu.genes.class.direction <- count_peak_direction(glu.genes.class.tss.direction)
glu.genes.class.direction.uniq <- unique(glu.genes.class.direction[,c(17,20:22)])
glu.genes.class.direction.prop <- direction_proportions(glu.genes.class.direction.uniq, "GLU")

library(ggplot2)
genes.class.direction.prop <- rbind(gaba.genes.class.direction.prop, glu.genes.class.direction.prop, olig.genes.class.direction.prop)
# proportion of each class within a DEG x direction set, one panel per celltype x DEG
ggplot(genes.class.direction.prop, aes(x=peak.direction, y=proportion, fill=class.quarter)) + geom_bar(stat="identity") + theme_classic() + facet_grid(celltype ~ DEG)
# raw counts per cell type, split by peak direction, one panel per DEG x class
ggplot(genes.class.direction.prop, aes(x=celltype, y=DEG.class.direction.count, fill=peak.direction)) + geom_bar(stat="identity") + theme_classic() + facet_grid(DEG ~ class.quarter, scales="free")
#### Look into specific examples
######### non-genic (upstream peaks)
# For each cell type: keep the DEG rows whose class is anything but "genic",
# list the gene symbols, then show columns 4, 14, 15 (log2fc, Gene_Symbol, distance).
example.cols <- c(4, 14, 15)

glu.genes.class.DEG.nongenic <- subset(glu.genes.class.direction, DEG == "DEG" & class != "genic")
glu.genes.class.DEG.nongenic[["Gene_Symbol"]]
# [1] "ZIC1" "ROS1" "NPTX2"
glu.genes.class.DEG.nongenic[, example.cols]
# log2fc Gene_Symbol distance
# <dbl> <chr> <int>
# 1 -0.518 ZIC1 -28858 <-- heroin peak, up-regulated in heroin
# 2 0.500 ROS1 -20704 <-- control peak
# 3 0.527 NPTX2 -2890 <-- control peak

gaba.genes.class.DEG.nongenic <- subset(gaba.genes.class.direction, DEG == "DEG" & class != "genic")
gaba.genes.class.DEG.nongenic[["Gene_Symbol"]]
# [1] "CD84" "DSP" "CDH19"
gaba.genes.class.DEG.nongenic[, example.cols]
# log2fc Gene_Symbol distance
# <dbl> <chr> <int>
# 1 -0.589 CD84 -27574
# 2 0.528 DSP -72454
# 3 0.675 CDH19 -28575

olig.genes.class.DEG.nongenic <- subset(olig.genes.class.direction, DEG == "DEG" & class != "genic")
olig.genes.class.DEG.nongenic[["Gene_Symbol"]]
# [1] "HSPB1" "UBC"
olig.genes.class.DEG.nongenic[, example.cols]
# log2fc Gene_Symbol distance
# <dbl> <chr> <int>
# 1 0.522 HSPB1 -7378
# 2 0.590 UBC -2372
######### genic (look into exon vs intron peaks)
# For each cell type: keep the DEG rows whose class is "genic" and show
# columns 4, 14, 20 (log2fc, Gene_Symbol, class.quarter). The exon / intron / TSS
# notes in the tables below are manual annotations.
genic.cols <- c(4, 14, 20)

glu.genes.class.DEG.genic <- subset(glu.genes.class.direction, DEG == "DEG" & class == "genic")
glu.genes.class.DEG.genic[, genic.cols]
# log2fc Gene_Symbol class.quarter
# <dbl> <chr> <chr>
# 1 0.505 STYXL2 genic.fourth <- exon; heroin down-regulated
# 2 0.522 LOC105373750 genic.fourth <- intron; heroin down-regulated
# 3 -0.501 MS4A8 genic.first

gaba.genes.class.DEG.genic <- subset(gaba.genes.class.direction, DEG == "DEG" & class == "genic")
gaba.genes.class.DEG.genic[, genic.cols]
# log2fc Gene_Symbol class.quarter
# <dbl> <chr> <chr>
# 1 0.533 MFAP3L genic.first <- exon
# 2 0.508 PCSK1 genic.third <- exon/intron
# 3 0.507 VGF genic.fourth <- exon
# 4 0.699 VGF genic.fourth <- exon
# 5 0.537 TRIB1 genic.first <- TSS
# 6 0.586 RCN1 genic.third
# 7 0.589 LINC01619 genic.third <- intron

olig.genes.class.DEG.genic <- subset(olig.genes.class.direction, DEG == "DEG" & class == "genic")
olig.genes.class.DEG.genic[, genic.cols]
# log2fc Gene_Symbol class.quarter
# <dbl> <chr> <chr>
# 1 -0.958 KCNN3 genic.third <- intron
# 2 -0.528 ATF3 genic.first <- intron
# 3 0.550 LOC105373759 genic.first <- TSS
# 4 0.620 BCL6 genic.first
# 5 -0.531 PXDC1 genic.first <- intron
# 6 0.586 TMEFF1 genic.fourth <- intron
# 7 0.579 CORO1C genic.third <- intron
# 8 -0.532 LOC105371348 genic.first <- intron
# 9 0.643 PER1 genic.first <- TSS
# 10 0.501 ADAMTS1 genic.first <- TSS
# The directory name contains spaces and parentheses, so it has to be quoted;
# unquoted, the shell rejects the "(" as a syntax error.
cd "/Users/27n/Dropbox (ORNL)/Shared/Jail-room Share/Projects/Mt_Sinai_Opioids/ATAC.Consensus.Peaks/PsychENCODE_hg19_enhancers"

# Rewrite UCSC-style chromosome names (chr1..chr22, chrX, chrY) read from stdin
# as RefSeq accessions (NC_0000NN.v) on stdout.
# Two-digit names are substituted before single-digit ones so that "chr1" can no
# longer match the front of "chr10".."chr19" (likewise "chr2" vs "chr20".."chr22").
# The patterns are unanchored, so every occurrence on a line is rewritten, not
# only the first column.
# NOTE(review): only the names change -- coordinates are passed through untouched.
# The inputs are labelled hg19 and the outputs hg38; confirm the coordinates were
# lifted over (e.g. with liftOver) before these files are intersected with
# GRCh38 peaks.
chr_to_refseq() {
    sed -e 's/chr10/NC_000010.11/g' \
        -e 's/chr11/NC_000011.10/g' \
        -e 's/chr12/NC_000012.12/g' \
        -e 's/chr13/NC_000013.11/g' \
        -e 's/chr14/NC_000014.9/g' \
        -e 's/chr15/NC_000015.10/g' \
        -e 's/chr16/NC_000016.10/g' \
        -e 's/chr17/NC_000017.11/g' \
        -e 's/chr18/NC_000018.10/g' \
        -e 's/chr19/NC_000019.10/g' \
        -e 's/chr20/NC_000020.11/g' \
        -e 's/chr21/NC_000021.9/g' \
        -e 's/chr22/NC_000022.11/g' \
        -e 's/chr1/NC_000001.11/g' \
        -e 's/chr2/NC_000002.12/g' \
        -e 's/chr3/NC_000003.12/g' \
        -e 's/chr4/NC_000004.12/g' \
        -e 's/chr5/NC_000005.10/g' \
        -e 's/chr6/NC_000006.12/g' \
        -e 's/chr7/NC_000007.14/g' \
        -e 's/chr8/NC_000008.11/g' \
        -e 's/chr9/NC_000009.12/g' \
        -e 's/chrX/NC_000023.11/g' \
        -e 's/chrY/NC_000024.10/g'
}

chr_to_refseq < DER-03b_hg19_high_confidence_PEC_enhancers.bed > DER-03b_hg38_high_confidence_PEC_enhancers.bed
chr_to_refseq < DER-03a_hg19_PEC_enhancers.bed > DER-03a_hg38_PEC_enhancers.bed
library(tidygenomics)
library(dplyr)
library(tidyr)

# High-confidence PsychENCODE enhancers (file with renamed chromosomes).
setwd("/Users/27n/Dropbox (ORNL)/Shared/Jail-room Share/Projects/Mt_Sinai_Opioids/ATAC.Consensus.Peaks/PsychENCODE_hg19_enhancers")
enhancer <- read.delim("DER-03b_hg38_high_confidence_PEC_enhancers.bed", header = FALSE, sep = "\t")
names(enhancer) <- c("Chr", "Start", "End", "Enhancer.ID")

# ROTS result tables, one per cell type (space-separated, with a header row).
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
gaba <- read.delim("myresults_lower1_pfalse_gaba.bed", header = TRUE, sep = " ")
glu <- read.delim("myresults_lower1_pfalse_glu.bed", header = TRUE, sep = " ")
olig <- read.delim("myresults_lower1_pfalse_olig.bed", header = TRUE, sep = " ")

# Interval intersection of ROTS peaks with enhancers.
interval.cols <- c("Chr", "Start", "End")
gaba.enhancer <- genome_intersect(gaba, enhancer, by = interval.cols)
# 1170
glu.enhancer <- genome_intersect(glu, enhancer, by = interval.cols)
# 1801
olig.enhancer <- genome_intersect(olig, enhancer, by = interval.cols)
# 1135

# Number of intersections whose |logfc| exceeds 0.5 (rows with NA logfc are not counted).
sum(abs(gaba.enhancer$logfc) > 0.5, na.rm = TRUE)
# 5
sum(abs(glu.enhancer$logfc) > 0.5, na.rm = TRUE)
# 18
sum(abs(olig.enhancer$logfc) > 0.5, na.rm = TRUE)
# 34
### using all enhancers (not just high confidence)
# Same intersection as for the high-confidence set, but against the full
# PsychENCODE enhancer list (DER-03a).
setwd("/Users/27n/Dropbox (ORNL)/Shared/Jail-room Share/Projects/Mt_Sinai_Opioids/ATAC.Consensus.Peaks/PsychENCODE_hg19_enhancers")
enhancer <- read.delim("DER-03a_hg38_PEC_enhancers.bed", header = FALSE, sep = "\t")
names(enhancer) <- c("Chr", "Start", "End", "Enhancer.ID")

setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
gaba <- read.delim("myresults_lower1_pfalse_gaba.bed", header = TRUE, sep = " ")
glu <- read.delim("myresults_lower1_pfalse_glu.bed", header = TRUE, sep = " ")
olig <- read.delim("myresults_lower1_pfalse_olig.bed", header = TRUE, sep = " ")

interval.cols <- c("Chr", "Start", "End")
gaba.enhancer <- genome_intersect(gaba, enhancer, by = interval.cols)
glu.enhancer <- genome_intersect(glu, enhancer, by = interval.cols)
olig.enhancer <- genome_intersect(olig, enhancer, by = interval.cols)

# Intersections with |logfc| > 0.5.
# NOTE(review): the recorded counts below are identical to the high-confidence
# run -- confirm they were re-measured rather than copied.
sum(abs(gaba.enhancer$logfc) > 0.5, na.rm = TRUE)
# 5
sum(abs(glu.enhancer$logfc) > 0.5, na.rm = TRUE)
# 18
sum(abs(olig.enhancer$logfc) > 0.5, na.rm = TRUE)
# 34
#### look at overlap with all peaks (not just differential)
# Headerless, tab-separated BED reader (columns come back as V1, V2, ...).
read_bed <- function(path) read.delim(path, header = FALSE, sep = "\t")

# Consensus ATAC peaks per cell type and condition.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/diffbind.dba.66")
glu.atac.control <- read_bed("diffbind.MACS2.consensus.glu.control.bed")
glu.atac.opioid <- read_bed("diffbind.MACS2.consensus.glu.heroin.bed")
gaba.atac.control <- read_bed("diffbind.MACS2.consensus.gaba.control.bed")
gaba.atac.opioid <- read_bed("diffbind.MACS2.consensus.gaba.heroin.bed")
olig.atac.control <- read_bed("diffbind.MACS2.consensus.olig.control.bed")
olig.atac.opioid <- read_bed("diffbind.MACS2.consensus.olig.heroin.bed")

# Full enhancer list; the interval columns are named V1..V3 to match the peak tables.
setwd("/Users/27n/Dropbox (ORNL)/Shared/Jail-room Share/Projects/Mt_Sinai_Opioids/ATAC.Consensus.Peaks/PsychENCODE_hg19_enhancers")
enhancer <- read_bed("DER-03a_hg38_PEC_enhancers.bed")
names(enhancer) <- c("V1", "V2", "V3", "Enhancer.ID")
bed.cols <- c("V1", "V2", "V3")

# For each peak set: number of peaks, then number of peak/enhancer intersections.
glu.control.enhancer <- genome_intersect(glu.atac.control, enhancer, by = bed.cols)
nrow(glu.atac.control)
# 83915
nrow(glu.control.enhancer)
# 6052
glu.heroin.enhancer <- genome_intersect(glu.atac.opioid, enhancer, by = bed.cols)
nrow(glu.atac.opioid)
# 78655
nrow(glu.heroin.enhancer)
# 5823
gaba.control.enhancer <- genome_intersect(gaba.atac.control, enhancer, by = bed.cols)
nrow(gaba.atac.control)
# 51337
nrow(gaba.control.enhancer)
# 4024
gaba.heroin.enhancer <- genome_intersect(gaba.atac.opioid, enhancer, by = bed.cols)
nrow(gaba.atac.opioid)
# 47329
nrow(gaba.heroin.enhancer)
# 3802
olig.control.enhancer <- genome_intersect(olig.atac.control, enhancer, by = bed.cols)
nrow(olig.atac.control)
# 38060
nrow(olig.control.enhancer)
# 3368
olig.heroin.enhancer <- genome_intersect(olig.atac.opioid, enhancer, by = bed.cols)
nrow(olig.atac.opioid)
# 46054
nrow(olig.heroin.enhancer)
# 3846
# Repeat the consensus-peak / enhancer intersection with the high-confidence
# enhancer set (DER-03b); the *.atac.* peak tables are the ones already loaded.
setwd("/Users/27n/Dropbox (ORNL)/Shared/Jail-room Share/Projects/Mt_Sinai_Opioids/ATAC.Consensus.Peaks/PsychENCODE_hg19_enhancers")
enhancer <- read.delim("DER-03b_hg38_high_confidence_PEC_enhancers.bed", header = FALSE, sep = "\t")
names(enhancer) <- c("V1", "V2", "V3", "Enhancer.ID")
bed.cols <- c("V1", "V2", "V3")

# For each peak set: number of peaks, then number of peak/enhancer intersections.
glu.control.enhancer <- genome_intersect(glu.atac.control, enhancer, by = bed.cols)
nrow(glu.atac.control)
# 83915
nrow(glu.control.enhancer)
# 1662
glu.heroin.enhancer <- genome_intersect(glu.atac.opioid, enhancer, by = bed.cols)
nrow(glu.atac.opioid)
# 78655
nrow(glu.heroin.enhancer)
# 1567
gaba.control.enhancer <- genome_intersect(gaba.atac.control, enhancer, by = bed.cols)
nrow(gaba.atac.control)
# 51337
nrow(gaba.control.enhancer)
# 1086
gaba.heroin.enhancer <- genome_intersect(gaba.atac.opioid, enhancer, by = bed.cols)
nrow(gaba.atac.opioid)
# 47329
nrow(gaba.heroin.enhancer)
# 1000
olig.control.enhancer <- genome_intersect(olig.atac.control, enhancer, by = bed.cols)
nrow(olig.atac.control)
# 38060
nrow(olig.control.enhancer)
# 972
olig.heroin.enhancer <- genome_intersect(olig.atac.opioid, enhancer, by = bed.cols)
nrow(olig.atac.opioid)
# 46054
nrow(olig.heroin.enhancer)
# 1091
library(tidygenomics)
library(dplyr)
library(tidyr)
# Overlap of ROTS peaks with single-sample ChIP-seq broadPeak calls (OLIG and GLU).
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Chipseq/")
olig.chip <- read.delim("H.276.OLIG_peaks.chrfix.broadPeak", header=F, sep="\t")
# 68563
glu.chip <- read.delim("H.372.GLU_peaks.chrfix.broadPeak", header=F, sep="\t")
# 94325
colnames(olig.chip) <- c("Chr", "Start", "End", "Chip.ID", "peak", "strand", "x", "y", "z")
colnames(glu.chip) <- c("Chr", "Start", "End", "Chip.ID", "peak", "strand", "x", "y", "z")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
gaba <- read.delim("myresults_lower1_pfalse_gaba.bed", header=T, sep=" ")
# 54332
glu <- read.delim("myresults_lower1_pfalse_glu.bed", header=T, sep=" ")
# 88989
olig <- read.delim("myresults_lower1_pfalse_olig.bed", header=T, sep=" ")
# 46889
# No GABA ChIP file is read in this section, so gaba.chip only exists if it was
# loaded elsewhere in the session. Skip the GABA overlap instead of stopping with
# "object 'gaba.chip' not found".
have.gaba.chip <- exists("gaba.chip")
if (have.gaba.chip) {
  gaba.chip.overlap <- genome_intersect(gaba, gaba.chip, by=c("Chr", "Start", "End"))
} else {
  message("gaba.chip is not loaded; skipping the GABA ChIP overlap")
}
glu.chip.overlap <- genome_intersect(glu, glu.chip, by=c("Chr", "Start", "End"))
# 56817 / 88989 = 0.6384722
olig.chip.overlap <- genome_intersect(olig, olig.chip, by=c("Chr", "Start", "End"))
# 32683 / 46889 = 0.6970292
# Overlaps restricted to differential peaks (|logfc| > 0.5).
if (have.gaba.chip) {
  print(nrow(subset(gaba.chip.overlap, abs(gaba.chip.overlap$logfc) > 0.5)))
}
# (no GABA value recorded)
nrow(subset(glu.chip.overlap, abs(glu.chip.overlap$logfc) > 0.5))
# 575 / 1053 glu differential peaks = 0.5460589
nrow(subset(olig.chip.overlap, abs(olig.chip.overlap$logfc) > 0.5))
# 1168 / 1414 olig differential peaks = 0.8260255
### All peaks
# Overlap of control consensus ATAC peaks with the single-sample ChIP-seq calls.
# The ChIP tables are re-read without renaming so that all inputs share V1..V3.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Chipseq/")
olig.chip <- read.delim("H.276.OLIG_peaks.chrfix.broadPeak", header=F, sep="\t")
glu.chip <- read.delim("H.372.GLU_peaks.chrfix.broadPeak", header=F, sep="\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/diffbind.dba.66")
glu.atac.control <- read.delim("diffbind.MACS2.consensus.glu.control.bed", header=F, sep="\t")
# 83915
gaba.atac.control <- read.delim("diffbind.MACS2.consensus.gaba.control.bed", header=F, sep="\t")
# 51337
olig.atac.control <- read.delim("diffbind.MACS2.consensus.olig.control.bed", header=F, sep="\t")
# 38060
# No GABA ChIP file is read in this section; only intersect when gaba.chip has
# been loaded elsewhere, instead of stopping with "object 'gaba.chip' not found".
if (exists("gaba.chip")) {
  gaba.atac.control.chip <- genome_intersect(gaba.atac.control, gaba.chip, by=c("V1", "V2", "V3"))
} else {
  message("gaba.chip is not loaded; skipping the GABA ChIP overlap")
}
glu.atac.control.chip <- genome_intersect(glu.atac.control, glu.chip, by=c("V1", "V2", "V3"))
# 54780 / 83915 = 0.6528034 atac peaks
olig.atac.control.chip <- genome_intersect(olig.atac.control, olig.chip, by=c("V1", "V2", "V3"))
# 28824 / 38060 = 0.7573305 atac peaks
# --> repeat the overlap with the merged ChIP-seq data
library(tidygenomics)
library(dplyr)
library(tidyr)
## All peaks
# Headerless, tab-separated BED reader (columns come back as V1, V2, ...).
read_bed <- function(path) read.delim(path, header = FALSE, sep = "\t")

# Merged ChIP-seq peak sets, one per cell type.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Chipseq/")
olig.chip <- read_bed("olig.merged.peaks.bed")
glu.chip <- read_bed("glu.merged.peaks.bed")
gaba.chip <- read_bed("gaba.merged.peaks.bed")

# Control consensus ATAC peaks.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/diffbind.dba.66")
glu.atac.control <- read_bed("diffbind.MACS2.consensus.glu.control.bed")
# 83915
gaba.atac.control <- read_bed("diffbind.MACS2.consensus.gaba.control.bed")
# 51337
olig.atac.control <- read_bed("diffbind.MACS2.consensus.olig.control.bed")
# 38060

# Intersect, then count distinct V4.x values (the ATAC-side V4 column) so that a
# peak hitting several ChIP intervals is only counted once.
bed.cols <- c("V1", "V2", "V3")
gaba.atac.control.chip <- genome_intersect(gaba.atac.control, gaba.chip, by = bed.cols)
length(unique(gaba.atac.control.chip[["V4.x"]]))
# 43102 / 51337 = 0.8395894 atac peaks
glu.atac.control.chip <- genome_intersect(glu.atac.control, glu.chip, by = bed.cols)
length(unique(glu.atac.control.chip[["V4.x"]]))
# 68842 / 83915 = 0.8203778 atac peaks
olig.atac.control.chip <- genome_intersect(olig.atac.control, olig.chip, by = bed.cols)
length(unique(olig.atac.control.chip[["V4.x"]]))
# 30756 / 38060 = 0.8080925 atac peaks
## ROTS peaks
# Merged ChIP-seq peak sets, renamed so the interval columns match the ROTS tables.
chip.colnames <- c("Chr", "Start", "End", "chip.scores")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Chipseq/")
olig.chip <- read.delim("olig.merged.peaks.bed", header = FALSE, sep = "\t")
# 170081
glu.chip <- read.delim("glu.merged.peaks.bed", header = FALSE, sep = "\t")
# 497850
gaba.chip <- read.delim("gaba.merged.peaks.bed", header = FALSE, sep = "\t")
# 437483
names(olig.chip) <- chip.colnames
names(glu.chip) <- chip.colnames
names(gaba.chip) <- chip.colnames

# ROTS result tables.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
gaba <- read.delim("myresults_lower1_pfalse_gaba.bed", header = TRUE, sep = " ")
# 54332
glu <- read.delim("myresults_lower1_pfalse_glu.bed", header = TRUE, sep = " ")
# 88989
olig <- read.delim("myresults_lower1_pfalse_olig.bed", header = TRUE, sep = " ")
# 46889

interval.cols <- c("Chr", "Start", "End")
gaba.chip.overlap <- genome_intersect(gaba, gaba.chip, by = interval.cols)
glu.chip.overlap <- genome_intersect(glu, glu.chip, by = interval.cols)
olig.chip.overlap <- genome_intersect(olig, olig.chip, by = interval.cols)

# Distinct ROTS peaks (by GeneID) with at least one ChIP overlap.
length(unique(gaba.chip.overlap[["GeneID"]]))
# 44907 / 54332 ROTS peaks = 0.8265295
length(unique(glu.chip.overlap[["GeneID"]]))
# 71718 / 88989 ROTS peaks = 0.8059198
length(unique(olig.chip.overlap[["GeneID"]]))
# 35517 / 46889 ROTS peaks = 0.7574698

# Same count restricted to differential peaks: rows with |logfc| > 0.5
# (rows with NA logfc are dropped).
keep_differential <- function(overlap) overlap[which(abs(overlap$logfc) > 0.5), ]
gaba.chip.overlap.diff <- keep_differential(gaba.chip.overlap)
length(unique(gaba.chip.overlap.diff[["GeneID"]]))
# 458 / 571 gaba differential peaks = 0.8021016
glu.chip.overlap.diff <- keep_differential(glu.chip.overlap)
length(unique(glu.chip.overlap.diff[["GeneID"]]))
# 743 / 1053 glu differential peaks = 0.705603
olig.chip.overlap.diff <- keep_differential(olig.chip.overlap)
length(unique(olig.chip.overlap.diff[["GeneID"]]))
# 1220 / 1414 olig differential peaks = 0.8628006
#####classify overlap... how many BP / proportion?
# overlap      = End - Start of each intersection row
# overlap.prop = overlap divided by the row's `width` column
# presumably `width` is the full ROTS peak width, making overlap.prop the covered
# fraction of the peak -- TODO confirm
gaba.chip.overlap.diff[["overlap"]] <- with(gaba.chip.overlap.diff, End - Start)
gaba.chip.overlap.diff[["overlap.prop"]] <- with(gaba.chip.overlap.diff, overlap / width)
summary(gaba.chip.overlap.diff[["overlap.prop"]])
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.00165 0.97120 1.00000 0.86823 1.00000 1.00000

olig.chip.overlap.diff[["overlap"]] <- with(olig.chip.overlap.diff, End - Start)
olig.chip.overlap.diff[["overlap.prop"]] <- with(olig.chip.overlap.diff, overlap / width)
summary(olig.chip.overlap.diff[["overlap.prop"]])
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.01447 0.96712 1.00000 0.89800 1.00000 1.00000

glu.chip.overlap.diff[["overlap"]] <- with(glu.chip.overlap.diff, End - Start)
glu.chip.overlap.diff[["overlap.prop"]] <- with(glu.chip.overlap.diff, overlap / width)
summary(glu.chip.overlap.diff[["overlap.prop"]])
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.001996 0.703742 1.000000 0.816037 1.000000 1.000000
# make ROTS_input.csv file with consensus peaks and bam directories
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
df <- read.delim("diffbind.macs2.outlier.sample.csv", header = TRUE, sep = ",")

# Rows of the sample sheet for one tissue x condition, extended with three columns:
#   peaks - peak file shared by the whole group
#   rep   - group code (1 = control, 2 = heroin in the calls below)
#   bam   - project directory prepended to the sheet's bamReads entry
rots_rows <- function(tissue, condition, peak.file, rep.id) {
  rows <- subset(df, df$Condition == condition & df$Tissue == tissue)
  rows$peaks <- peak.file
  rows$rep <- rep.id
  rows$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", rows$bamReads)
  rows
}

gaba.control <- rots_rows("gaba", "control", "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam/genrich.gaba.control.narrowPeak", 1)
gaba.heroin <- rots_rows("gaba", "heroin", "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam/genrich.gaba.heroin.narrowPeak", 2)
glu.control <- rots_rows("glu", "control", "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam/genrich.glu.control.narrowPeak", 1)
glu.heroin <- rots_rows("glu", "heroin", "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam/genrich.glu.heroin.narrowPeak", 2)
olig.control <- rots_rows("olig", "control", "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam/genrich.olig.control.narrowPeak", 1)
olig.heroin <- rots_rows("olig", "heroin", "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/sam/genrich.olig.heroin.narrowPeak", 2)

# Columns 12, 14, 13 are selected by position; presumably these are peaks, bam, rep
# (the three columns appended above) -- this relies on the sample sheet having 11 columns.
rots.cols <- c(12, 14, 13)
gaba <- rbind(gaba.control[, rots.cols], gaba.heroin[, rots.cols])
glu <- rbind(glu.control[, rots.cols], glu.heroin[, rots.cols])
olig <- rbind(olig.control[, rots.cols], olig.heroin[, rots.cols])
gaba.glu <- rbind(gaba, glu)
gaba.glu.olig <- rbind(gaba.glu, olig)

# Semicolon-separated, unquoted, no header and no row names.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts")
write.table(gaba.glu.olig, "ROTS_input_all.csv", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = ";")
write.table(gaba, "ROTS_input_gaba.csv", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = ";")
write.table(glu, "ROTS_input_glu.csv", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = ";")
write.table(olig, "ROTS_input_olig.csv", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = ";")
## run on macs2 peaks (diffbind consensus)
#### make sure to install all R packages within conda environment before submitting (ROTS, GenomicRanges, rtracklayer, Rsubread, Rsamtools, ade4, made4)
# shell: submit the ROTS batch script from the directory that holds the ROTS_input*.csv files
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts
sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/Rscriptandes.sh
# NOTE: each input set has to be run separately on the command line, because the ROTS script only reads a file named ROTS_input.csv
# Fragments of the ROTS run, one copy per cell type (glu, then olig).
# Object_holder is not created here -- presumably it comes from the earlier steps of
# the ROTS script and was built from the matching ROTS_input.csv; verify before re-running.
# differentialCall settings: B = 100, K = half the number of rows of
# Filtered_NormCounts (rounded down), fixed seed 14, unpaired, normalized = TRUE.

# --- glu ---
Object_holder2 <- differentialCall(Object_holder, B=100, K=floor(nrow(Object_holder$Filtered_NormCounts)/2), seed = 14,paired = FALSE, normalized = TRUE)
save(Object_holder2, file = "object_lower1_pfalse_genrich_glu.RData")
# fdr=1: no FDR filtering -- presumably every tested region is written out (TODO confirm)
results <-outputGeneration(Object_holder2,fdr=1)
write.table(results,"myresults_lower1_pfalse_genrich_glu.bed",quote = FALSE,row.names = FALSE)
# NOTE(review): second save of the same object under a different file name ("pfalse2")
save(Object_holder2,file = "object_lower1_pfalse2_genrich_glu.RData")

# --- olig ---
Object_holder2 <- differentialCall(Object_holder, B=100, K=floor(nrow(Object_holder$Filtered_NormCounts)/2), seed = 14,paired = FALSE, normalized = TRUE)
save(Object_holder2, file = "object_lower1_pfalse_genrich_olig.RData")
results <-outputGeneration(Object_holder2,fdr=1)
write.table(results,"myresults_lower1_pfalse_genrich_olig.bed",quote = FALSE,row.names = FALSE)
# NOTE(review): second save of the same object under a different file name ("pfalse2")
save(Object_holder2,file = "object_lower1_pfalse2_genrich_olig.RData")
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/

# Sort the gene-level GTF by sequence name, then numerically by start (column 4).
genes=ref/GCF_000001405.39_GRCh38.p13_genomic.gene
sort -k 1,1 -k 4,4n "${genes}.gtf" > "${genes}.sorted.gtf"

# Closest gene for every consensus peak, per cell type and condition.
# -D b reports a signed distance relative to the gene's (B) strand; -id ignores
# downstream features (see the bedtools closest documentation).
for celltype in glu gaba olig; do
    for condition in control heroin; do
        peaks="diffbind.MACS2.consensus.${celltype}.${condition}"
        closestBed -D b -id -b "${genes}.sorted.gtf" -a "${peaks}.bed" > "${peaks}.closestGene.bed"
    done
done
setwd("/gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only")

# edgeR TMM-normalized counts per cell type: the first CSV column supplies the
# row names, the remaining columns are samples. Stored as matrices.
gaba.TMM <- as.matrix(read.delim("Gabba_edgeR_TMM_normalized_counts.csv", header = TRUE, row.names = 1, sep = ",", stringsAsFactors = FALSE))
glu.TMM <- as.matrix(read.delim("Glu_edgeR_TMM_normalized_counts.csv", header = TRUE, row.names = 1, sep = ","))
olig.TMM <- as.matrix(read.delim("Olig_edgeR_TMM_normalized_counts.csv", header = TRUE, row.names = 1, sep = ","))

# Sample columns by condition; the same positions are applied to all three matrices.
# heroin=1,8:11,13:14,16:17,19:25,27:28,30,32:36,38:39,42,44,48:49
# control=2:7,12,15,18,26,29,31,37,40:41,43,45:47,50:60
control.cols <- c(2:7, 12, 15, 18, 26, 29, 31, 37, 40:41, 43, 45:47, 50:60)
heroin.cols <- c(1, 8:11, 13:14, 16:17, 19:25, 27:28, 30, 32:36, 38:39, 42, 44, 48:49)

glu.TMM.control <- glu.TMM[, control.cols]
glu.TMM.opioid <- glu.TMM[, heroin.cols]
gaba.TMM.control <- gaba.TMM[, control.cols]
gaba.TMM.opioid <- gaba.TMM[, heroin.cols]
olig.TMM.control <- olig.TMM[, control.cols]
olig.TMM.opioid <- olig.TMM[, heroin.cols]

# Per-row mean across the samples of one group (NAs are not removed), returned as
# a data frame with the mean in TMM.mean and the row name copied into gid.
mean_tmm <- function(tmm) {
  means <- data.frame(TMM.mean = rowMeans(tmm))
  means$gid <- rownames(means)
  means
}

glu.TMM.control.mean <- mean_tmm(glu.TMM.control)
glu.TMM.opioid.mean <- mean_tmm(glu.TMM.opioid)
gaba.TMM.control.mean <- mean_tmm(gaba.TMM.control)
gaba.TMM.opioid.mean <- mean_tmm(gaba.TMM.opioid)
olig.TMM.control.mean <- mean_tmm(olig.TMM.control)
olig.TMM.opioid.mean <- mean_tmm(olig.TMM.opioid)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")

# closestBed output per cell type / condition (headerless, tab-separated).
glu.atac.control <- read.delim("diffbind.MACS2.consensus.glu.control.closestGene.bed", header = FALSE, sep = "\t")
glu.atac.opioid <- read.delim("diffbind.MACS2.consensus.glu.heroin.closestGene.bed", header = FALSE, sep = "\t")
gaba.atac.control <- read.delim("diffbind.MACS2.consensus.gaba.control.closestGene.bed", header = FALSE, sep = "\t")
gaba.atac.opioid <- read.delim("diffbind.MACS2.consensus.gaba.heroin.closestGene.bed", header = FALSE, sep = "\t")
olig.atac.control <- read.delim("diffbind.MACS2.consensus.olig.control.closestGene.bed", header = FALSE, sep = "\t")
olig.atac.opioid <- read.delim("diffbind.MACS2.consensus.olig.heroin.closestGene.bed", header = FALSE, sep = "\t")

library(tidyr)

# Two-step split of V13 (presumably the GTF attribute field -- TODO confirm):
#   *.gene : keep only the text before the first ";" as column `gene`
#   *.gid  : split that text at spaces into `gene` (the key) and `gid` (the value)
glu.atac.control.gene <- separate(glu.atac.control, col = V13, into = "gene", sep = ";")
glu.atac.control.gid <- separate(glu.atac.control.gene, col = gene, into = c("gene", "gid"), sep = " ")

glu.atac.opioid.gene <- separate(glu.atac.opioid, col = V13, into = "gene", sep = ";")
glu.atac.opioid.gid <- separate(glu.atac.opioid.gene, col = gene, into = c("gene", "gid"), sep = " ")

gaba.atac.control.gene <- separate(gaba.atac.control, col = V13, into = "gene", sep = ";")
gaba.atac.control.gid <- separate(gaba.atac.control.gene, col = gene, into = c("gene", "gid"), sep = " ")

gaba.atac.opioid.gene <- separate(gaba.atac.opioid, col = V13, into = "gene", sep = ";")
gaba.atac.opioid.gid <- separate(gaba.atac.opioid.gene, col = gene, into = c("gene", "gid"), sep = " ")

olig.atac.control.gene <- separate(olig.atac.control, col = V13, into = "gene", sep = ";")
olig.atac.control.gid <- separate(olig.atac.control.gene, col = gene, into = c("gene", "gid"), sep = " ")

olig.atac.opioid.gene <- separate(olig.atac.opioid, col = V13, into = "gene", sep = ";")
olig.atac.opioid.gid <- separate(olig.atac.opioid.gene, col = gene, into = c("gene", "gid"), sep = " ")
library(dplyr)
library(ggplot2)
library(RColorBrewer)
# reversed 11-colour RdYlBu palette, used to colour points by distance (V14)
cols <- rev(brewer.pal(11, 'RdYlBu'))

# For every cell type / condition: attach the group's mean TMM to each peak via its
# gene id, then correlate column 4 (V4, the peak's 4th BED column) with
# column 7 (TMM.mean) over complete pairs.
# Kept columns c(1:4,14,15) = V1..V4, gid, V14 (distance to the closest gene).
glu.atac.control.gid.tmm <- left_join(glu.atac.control.gid[,c(1:4,14,15)], glu.TMM.control.mean, by="gid")
cor(glu.atac.control.gid.tmm[,c(4,7)], use = "complete.obs")
# 0.02936271
pdf("glu.atac.control.gid.tmm.pdf")
ggplot(glu.atac.control.gid.tmm, aes(x=V4, y=TMM.mean, color=V14)) + geom_point() + scale_colour_gradientn(colours = cols) + theme_classic()
dev.off()
# peaks within 1 kb of the closest gene (this also includes distance 0)
glu.atac.control.gid.tmm.promoter <- subset(glu.atac.control.gid.tmm, abs(glu.atac.control.gid.tmm$V14) < 1000)
cor(glu.atac.control.gid.tmm.promoter[,c(4,7)], use = "complete.obs")
# -0.009595311
# peaks overlapping the gene itself (distance 0)
glu.atac.control.gid.tmm.genic <- subset(glu.atac.control.gid.tmm, abs(glu.atac.control.gid.tmm$V14) == 0)
cor(glu.atac.control.gid.tmm.genic[,c(4,7)], use = "complete.obs")
# -0.01080707
glu.atac.opioid.gid.tmm <- left_join(glu.atac.opioid.gid[,c(1:4,14,15)], glu.TMM.opioid.mean, by="gid")
cor(glu.atac.opioid.gid.tmm[,c(4,7)], use = "complete.obs")
# 0.01705751
gaba.atac.control.gid.tmm <- left_join(gaba.atac.control.gid[,c(1:4,14,15)], gaba.TMM.control.mean, by="gid")
# Fixed: this used to correlate glu.atac.control.gid.tmm (copy-paste slip), so the
# value recorded here (0.02936271) was the glu control result, not the GABA one.
cor(gaba.atac.control.gid.tmm[,c(4,7)], use = "complete.obs")
# TODO: re-run and record the GABA control correlation
gaba.atac.opioid.gid.tmm <- left_join(gaba.atac.opioid.gid[,c(1:4,14,15)], gaba.TMM.opioid.mean, by="gid")
cor(gaba.atac.opioid.gid.tmm[,c(4,7)], use = "complete.obs")
# 0.02819406
olig.atac.control.gid.tmm <- left_join(olig.atac.control.gid[,c(1:4,14,15)], olig.TMM.control.mean, by="gid")
cor(olig.atac.control.gid.tmm[,c(4,7)], use = "complete.obs")
# 0.0942542
olig.atac.opioid.gid.tmm <- left_join(olig.atac.opioid.gid[,c(1:4,14,15)], olig.TMM.opioid.mean, by="gid")
cor(olig.atac.opioid.gid.tmm[,c(4,7)], use = "complete.obs")
# 0.0851718
#### Do we really expect to see a correlation here? The ATAC peaks are present/absent more than they are quantitative for their effect...
# ADAMTS1 [NC_000021.9:26835755-26845409]
## ATAC peak [NC_000021.9:26844572-26845987]
#!/bin/bash
#SBATCH --account=SYB105
#SBATCH --job-name=ADAMTS1
#SBATCH --nodes=2
#SBATCH --partition=gpu
#SBATCH --time=48:00:00
#SBATCH --mem-per-cpu=0
# Batch job: run ADAMTS1.R from the project directory inside the shared conda env.
# NOTE(review): two nodes are requested for a single R CMD BATCH process --
# confirm that more than one node is actually needed.

# Make conda available, then activate the analysis environment.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test

# ADAMTS1.R is resolved relative to the project directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
R CMD BATCH ADAMTS1.R

# Submit with:
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ADAMTS1.sh
# salloc -A SYB105 -p gpu -N 1 -t 2:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# First attempt at the ADAMTS1 accessibility violin plot, working from the full
# bin-by-sample count matrix. The note that follows this block records that the
# full matrix was too slow to process; the region is extracted with awk instead.
# Only unambiguous errors are fixed here (undefined function/object names);
# anything that needs knowledge of the data layout is flagged NOTE(review).
library(dplyr)
library(tidyr)
library(ggplot2)
# counts matrix file
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
# cts.in <- read.table("bin.matrix.txt", header=T, sep="\t", stringsAsFactors = F)
# cts.id <- unite(cts.in, "id", c(chr, start, end), sep="_", remove=FALSE)
# #cts <- as.matrix(cts.id, row.names="id")
# cts <- t(cts.id[,c(1,5:ncol(cts.id))])
# write.table(cts, "bin.matrix.transform.txt", quote=F, row.names=T, col.names=F, sep="\t")
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
cts <- read.delim("bin.matrix.transform.txt", header=T, sep="\t", stringsAsFactors = F, row.names=1)
# NOTE(review): header=T already consumes the first line of the file, so this
# replaces the column names with the first *data* row -- confirm intent.
colnames(cts) <- cts[1,]
# NOTE(review): without a comma this selects columns 2..nrow(cts), not rows;
# `cts[2:nrow(cts), ]` was probably intended. d and d.num are not used below.
d <- as.matrix(cts[2:nrow(cts)])
d.num <- as.numeric(d)
# sample / condition file
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
sample <- read.delim("atac.csaw.key1.rmdups.txt", header=T, sep="\t", stringsAsFactors = F)
# Split sample.name on "_" into a sample id and a sequencing tag.
sample.id <- separate(sample, sample.name, c("sample", "seq"), sep="_")
coldata <- sample.id[,c(1,3)]
rownames(coldata) <- coldata$sample
coldata$condition <- factor(coldata$condition)
# Fixed: the check referenced cts.id, which only exists in the commented-out
# code above; the object being reordered on the next line is cts.
all(rownames(coldata) %in% colnames(cts))
# Put the count columns in the same order as the rows of coldata.
cts <- cts[, rownames(coldata)]
all(rownames(coldata) == colnames(cts))
# NOTE(review): after the column subset above, cts holds only sample columns,
# so the chr/start/end/group/id/condition columns used below are presumably
# missing -- confirm the intended table before reviving this block.
cts.adam <- subset(cts, cts$chr == "NC_000021.9" & cts$start >= 26844572 & cts$end <= 26845987)
# Fixed: `susbet` typo; the OLIG subset is taken from the ADAMTS1 rows
# (cts.adam), as the result name indicates, rather than from the full matrix.
cts.adam.olig <- subset(cts.adam, cts.adam$group == "OLIG")
# NOTE(review): mean() is called with no argument and will error; the column to
# average cannot be determined from this block.
cts.adam.mean <- cts.adam.olig %>% group_by(id) %>% mutate(mean.atac = mean())
pdf("ADAMTS1.atac.cpm.violin.pdf")
ggplot(cts.adam.mean, aes(x=condition, y=mean.atac, fill=condition)) + geom_violin() + theme_classic()
dev.off()
# --> Processing the full bin matrix in R takes too long; instead, cut out the bins of interest with awk and make the plot from that small file.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins
# Keep the bins on NC_000021.9 whose start ($2) and end ($3) both fall inside
# 26844572-26845987 (the ATAC peak at ADAMTS1). A bare awk pattern prints the
# whole matching line.
awk '$1 == "NC_000021.9" && $2 >= 26844572 && $3 <= 26845987' bin.matrix.txt > bin.matrix.ADAMTS1.txt
# Save the header row on its own so it can be re-attached to any region extract.
head -n 1 bin.matrix.txt > bin.matrix.header.txt
# Header + region rows = a small table that R can read with header=T.
cat bin.matrix.header.txt bin.matrix.ADAMTS1.txt > bin.matrix.ADAMTS1.header.txt
# R
# Violin plot of per-sample mean ATAC signal over the ADAMTS1 peak bins,
# OLIG control vs OLIG heroin.
library(tidyr)
library(stringr)
library(reshape2)
library(ggplot2)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
adamts1 <- read.delim("bin.matrix.ADAMTS1.header.txt", header=T, sep="\t", stringsAsFactors = F)
# sample / condition file (read for reference; the groups below are selected by
# position, not by this table)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
sample <- read.delim("atac.csaw.key1.rmdups.txt", header=T, sep="\t", stringsAsFactors = F)
sample.id <- separate(sample, sample.name, c("sample", "seq"), sep="_")
# Sample positions (1-based, counted within columns 4:182 of the bin matrix).
# olig control = 31:45,48,51,54,57,60,63,66,69,72,75,78,81,84,87,90
# olig heroin = 121:135,138,141,144,147,150,153,156,159,162,165,168,173,176,179
# Fixed: the heroin list previously contained 145 where 144 belongs. 145 is in
# the GABA heroin list used for the VGF/PCSK1 plots, and the interleaved OLIG
# samples advance in steps of 3 (138, 141, 144, 147, ...), so 145 pulled a GABA
# sample into the OLIG group and dropped OLIG sample 144.
olig.control.idx <- c(31:45, seq(48, 90, by = 3))
olig.heroin.idx <- c(121:135, seq(138, 168, by = 3), 173, 176, 179)
# Drop the first three columns, then transpose so rows are samples and columns
# are bins.
adamts1.id <- adamts1[, 4:182]
adamts1.id.t <- data.frame(t(adamts1.id))
adamts1.id.num <- data.matrix(adamts1.id.t)
adamts1.olig.control <- adamts1.id.num[olig.control.idx,]
adamts1.olig.heroin <- adamts1.id.num[olig.heroin.idx,]
# One value per sample: the mean over all bins in the region.
adamts1.olig.control.mean <- data.frame(sample.mean = rowMeans(adamts1.olig.control, na.rm = FALSE, dims = 1))
adamts1.olig.heroin.mean <- data.frame(sample.mean = rowMeans(adamts1.olig.heroin, na.rm = FALSE, dims = 1))
adamts1.olig.control.mean$condition <- "Control"
adamts1.olig.heroin.mean$condition <- "Heroin"
adamts1.olig.mean <- rbind(adamts1.olig.control.mean, adamts1.olig.heroin.mean)
# The PDF is written to the current working directory (key.files, set above).
pdf("ADAMTS1.atac.cpm.violin.pdf")
ggplot(adamts1.olig.mean, aes(x=condition, y=sample.mean, fill=condition)) + geom_violin() + theme_classic()
dev.off()
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins
# Region extracts for two more loci. Each keeps the bins whose start ($2) and
# end ($3) lie inside the given interval, then prepends the existing header
# file (bin.matrix.header.txt) so R can read the result with header=T.

# VGF: NC_000007.14:101162509-101169956
awk '$1 == "NC_000007.14" && $2 >= 101162509 && $3 <= 101169956' bin.matrix.txt > bin.matrix.VGF.txt
cat bin.matrix.header.txt bin.matrix.VGF.txt > bin.matrix.VGF.header.txt

# PCSK1: NC_000005.10:96390333-96433248
awk '$1 == "NC_000005.10" && $2 >= 96390333 && $3 <= 96433248' bin.matrix.txt > bin.matrix.PCSK1.txt
cat bin.matrix.header.txt bin.matrix.PCSK1.txt > bin.matrix.PCSK1.header.txt
# R
# Violin plots of per-sample mean ATAC signal over the VGF and PCSK1 regions,
# GABA control vs GABA heroin.
library(tidyr)
library(stringr)
library(reshape2)
library(ggplot2)
# sample / condition file (read for reference; groups are selected by position)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
sample <- read.delim("atac.csaw.key1.rmdups.txt", header=T, sep="\t", stringsAsFactors = F)
sample.id <- separate(sample, sample.name, c("sample", "seq"), sep="_")

# Sample positions (1-based, counted within columns 4:182 of the bin matrix).
# gaba control = 1:15,46,49,52,55,58,61,64,67,70,73,76,79,82,85,88
# gaba heroin = 91:100,136,139,142,145,148,151,154,157,160,163,166,169,171,174,177
# NOTE(review): the other contiguous sample blocks in this file are 15 wide
# (1:15, 31:45, 121:135) while this one is 91:100 -- confirm that 101:105 are
# left out on purpose.
gaba.control.rows <- c(1:15, seq(46, 88, by = 3))
gaba.heroin.rows <- c(91:100, seq(136, 169, by = 3), 171, 174, 177)

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
vgf <- read.delim("bin.matrix.VGF.header.txt", header=T, sep="\t", stringsAsFactors = F)
pcsk1 <- read.delim("bin.matrix.PCSK1.header.txt", header=T, sep="\t", stringsAsFactors = F)

## VGF
# Drop the first three columns and transpose: rows = samples, columns = bins.
vgf.id <- vgf[, 4:182]
vgf.id.t <- data.frame(t(vgf.id))
vgf.id.num <- data.matrix(vgf.id.t)
vgf.gaba.control <- vgf.id.num[gaba.control.rows, ]
vgf.gaba.heroin <- vgf.id.num[gaba.heroin.rows, ]
# One value per sample: the mean over all bins in the region.
vgf.gaba.control.mean <- data.frame(sample.mean = rowMeans(vgf.gaba.control))
vgf.gaba.heroin.mean <- data.frame(sample.mean = rowMeans(vgf.gaba.heroin))
vgf.gaba.control.mean$condition <- "Control"
vgf.gaba.heroin.mean$condition <- "Heroin"
vgf.gaba.mean <- rbind(vgf.gaba.control.mean, vgf.gaba.heroin.mean)
# PDFs are written to the current working directory (bins, set above).
pdf("VGF.atac.cpm.violin.pdf")
ggplot(vgf.gaba.mean, aes(x = condition, y = sample.mean, fill = condition)) +
  geom_violin() +
  theme_classic() +
  scale_fill_manual(values = c("#999999", "#E69F00"))
dev.off()

## PCSK1 (same steps; this plot keeps the default fill colours)
pcsk1.id <- pcsk1[, 4:182]
pcsk1.id.t <- data.frame(t(pcsk1.id))
pcsk1.id.num <- data.matrix(pcsk1.id.t)
pcsk1.gaba.control <- pcsk1.id.num[gaba.control.rows, ]
pcsk1.gaba.heroin <- pcsk1.id.num[gaba.heroin.rows, ]
pcsk1.gaba.control.mean <- data.frame(sample.mean = rowMeans(pcsk1.gaba.control))
pcsk1.gaba.heroin.mean <- data.frame(sample.mean = rowMeans(pcsk1.gaba.heroin))
pcsk1.gaba.control.mean$condition <- "Control"
pcsk1.gaba.heroin.mean$condition <- "Heroin"
pcsk1.gaba.mean <- rbind(pcsk1.gaba.control.mean, pcsk1.gaba.heroin.mean)
pdf("PCSK1.atac.cpm.violin.pdf")
ggplot(pcsk1.gaba.mean, aes(x = condition, y = sample.mean, fill = condition)) +
  geom_violin() +
  theme_classic()
dev.off()
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# shuffle
# Build a randomised ("shuffled") version of every consensus peak set to use as
# a null when comparing control vs heroin peaks.
#
# Fixed: control and heroin were previously shuffled with the SAME -seed. With
# one shared seed both runs consume the same random stream, so the i-th shuffled
# heroin peak lands where the i-th shuffled control peak does and the two null
# sets are not independent. The control-vs-heroin "shuffle differential" counts
# recorded further down show this: they match the difference in peak numbers
# (e.g. gaba: 51337 - 47329 = 4008 vs 4092 recorded). Control keeps seed 2458,
# so the control shuffles are unchanged; heroin now gets its own seed. Any
# number derived from the heroin shuffles must be regenerated.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
genome_sizes=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.chr.size
control_seed=2458
heroin_seed=2459
for celltype in gaba glu olig; do
    bedtools shuffle -noOverlapping -seed "$control_seed" -g "$genome_sizes" \
        -i "diffbind.MACS2.consensus.${celltype}.control.bed" \
        > "diffbind.MACS2.consensus.${celltype}.control.shuffle.bed"
    bedtools shuffle -noOverlapping -seed "$heroin_seed" -g "$genome_sizes" \
        -i "diffbind.MACS2.consensus.${celltype}.heroin.bed" \
        > "diffbind.MACS2.consensus.${celltype}.heroin.shuffle.bed"
done
# ROTS
# Create one ROTS working directory per cell type under ROTS.scripts/shuffle and
# copy the launcher script, the code/ directory contents and the ROTS backbone
# script into each.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts
rots=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts
mkdir "$rots/shuffle"
for celltype in gaba glu olig; do
    dest="$rots/shuffle/$celltype"
    mkdir "$dest"
    mkdir "$dest/code"
    cp "$rots/Rscriptandes.sh" "$dest/Rscriptandes.sh"
    # the glob is deliberately left unquoted so it expands to every file in code/
    cp "$rots"/code/* "$dest/code/."
    cp "$rots/ROTS_backbone_base_1.0.5.R" "$dest/."
done
# make ROTS_input.csv file with consensus peaks and bam directories
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
# df <- read.delim("diffbind.macs2.outlier.sample.csv", header=T, sep=",")
#
# gaba.control <- subset(df, df$Condition == "control" & df$Tissue == "gaba")
# gaba.heroin <- subset(df, df$Condition == "heroin" & df$Tissue == "gaba")
# glu.control <- subset(df, df$Condition == "control" & df$Tissue == "glu")
# glu.heroin <- subset(df, df$Condition == "heroin" & df$Tissue == "glu")
# olig.control <- subset(df, df$Condition == "control" & df$Tissue == "olig")
# olig.heroin <- subset(df, df$Condition == "heroin" & df$Tissue == "olig")
#
# gaba.control$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.gaba.control.shuffle.bed"
# gaba.heroin$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.gaba.heroin.shuffle.bed"
# glu.control$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.glu.control.shuffle.bed"
# glu.heroin$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.glu.heroin.shuffle.bed"
# olig.control$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.olig.control.shuffle.bed"
# olig.heroin$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.olig.heroin.shuffle.bed"
#
# gaba.control$rep <- 1
# gaba.heroin$rep <- 2
# glu.control$rep <- 1
# glu.heroin$rep <- 2
# olig.control$rep <- 1
# olig.heroin$rep <- 2
#
# gaba.control$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", gaba.control$bamReads)
# gaba.heroin$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", gaba.heroin$bamReads)
# glu.control$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", glu.control$bamReads)
# glu.heroin$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", glu.heroin$bamReads)
# olig.control$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", olig.control$bamReads)
# olig.heroin$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", olig.heroin$bamReads)
#
# gaba <- rbind(gaba.control[,c(12,14,13)],gaba.heroin[,c(12,14,13)])
# glu <- rbind(glu.control[,c(12,14,13)],glu.heroin[,c(12,14,13)])
# olig <- rbind(olig.control[,c(12,14,13)],olig.heroin[,c(12,14,13)])
#
# gaba.glu <- rbind(gaba, glu)
# gaba.glu.olig <- rbind(gaba.glu, olig)
#
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts")
# write.table(gaba.glu.olig, "ROTS_input.csv", quote=F, row.names=F, col.names=F, sep=";")
#
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/shuffle/gaba")
# write.table(gaba, "ROTS_input.csv", quote=F, row.names=F, col.names=F, sep=";")
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/shuffle/glu")
# write.table(glu, "ROTS_input.csv", quote=F, row.names=F, col.names=F, sep=";")
# setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/shuffle/olig")
# write.table(olig, "ROTS_input.csv", quote=F, row.names=F, col.names=F, sep=";")
#
#
# ## run the scripts individually on the command line
# # salloc -A SYB105 -p gpu -N 1 -t 2:00:00
# source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
# R
#
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/shuffle/gaba/Rscriptandes.sh
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/shuffle/glu/Rscriptandes.sh
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/shuffle/olig/Rscriptandes.sh
# --> ROTS cannot be run on the randomized peaks, because a shuffled "consensus" peak set does not make sense for the read files it takes as input. Instead, call a peak differential when it is present in one condition and absent in the other (control/heroin), and do the same for the real peaks so the two can be compared.
library(tidygenomics)
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# gaba
# Presence/absence "differential" peaks: a peak is differential when it has no
# overlapping peak in the other condition. Done for the real consensus peaks and
# for their shuffled counterparts.
gaba.control <- read.delim("diffbind.MACS2.consensus.gaba.control.bed", header=F, sep="\t")
gaba.control.shuffle <- read.delim("diffbind.MACS2.consensus.gaba.control.shuffle.bed", header=F, sep="\t")
gaba.heroin <- read.delim("diffbind.MACS2.consensus.gaba.heroin.bed", header=F, sep="\t")
gaba.heroin.shuffle <- read.delim("diffbind.MACS2.consensus.gaba.heroin.shuffle.bed", header=F, sep="\t")

# Give every peak a row-number ID; each table uses its own ID column name so
# both IDs survive the intersect.
gaba.control$gaba.control.ID <- seq.int(nrow(gaba.control))
gaba.control.shuffle$gaba.control.shuffle.ID <- seq.int(nrow(gaba.control.shuffle))
gaba.heroin$gaba.heroin.ID <- seq.int(nrow(gaba.heroin))
gaba.heroin.shuffle$gaba.heroin.shuffle.ID <- seq.int(nrow(gaba.heroin.shuffle))

# Real peaks: drop every peak whose ID appears in the control/heroin overlap.
gaba.control.heroin.intersect <- genome_intersect(gaba.control, gaba.heroin, by=c("V1", "V2", "V3"))
gaba.control.differential <- gaba.control[
  !(gaba.control$gaba.control.ID %in% gaba.control.heroin.intersect$gaba.control.ID), ]
gaba.heroin.differential <- gaba.heroin[
  !(gaba.heroin$gaba.heroin.ID %in% gaba.control.heroin.intersect$gaba.heroin.ID), ]
nrow(gaba.control.differential)
# 7319 / 51337 = 0.1425677
nrow(gaba.heroin.differential)
# 3169 / 47329 = 0.06695683

# Shuffled peaks: same rule.
# NOTE(review): the shuffle counts recorded below presumably came from shuffles
# generated with one shared -seed for control and heroin. They are close to the
# difference in peak numbers (51337 - 47329 = 4008), which suggests the two
# shuffles were not independent -- confirm before using them as a null.
gaba.control.heroin.shuffle.intersect <- genome_intersect(gaba.control.shuffle, gaba.heroin.shuffle, by=c("V1", "V2", "V3"))
gaba.control.shuffle.differential <- gaba.control.shuffle[
  !(gaba.control.shuffle$gaba.control.shuffle.ID %in% gaba.control.heroin.shuffle.intersect$gaba.control.shuffle.ID), ]
gaba.heroin.shuffle.differential <- gaba.heroin.shuffle[
  !(gaba.heroin.shuffle$gaba.heroin.shuffle.ID %in% gaba.control.heroin.shuffle.intersect$gaba.heroin.shuffle.ID), ]
nrow(gaba.control.shuffle.differential)
# 4092 / 51337 = 0.07970859
nrow(gaba.heroin.shuffle.differential)
# 86 / 47329 = 0.001817068

# Stack control-only and heroin-only peaks (first four BED columns).
gaba.control.heroin.differential <- rbind(gaba.control.differential[, 1:4], gaba.heroin.differential[, 1:4])
gaba.control.heroin.shuffle.differential <- rbind(gaba.control.shuffle.differential[, 1:4], gaba.heroin.shuffle.differential[, 1:4])
# glu
# Presence/absence "differential" peaks for GLU: a peak is differential when it
# has no overlapping peak in the other condition. Done for the real consensus
# peaks and for their shuffled counterparts.
glu.control <- read.delim("diffbind.MACS2.consensus.glu.control.bed", header=F, sep="\t")
glu.control.shuffle <- read.delim("diffbind.MACS2.consensus.glu.control.shuffle.bed", header=F, sep="\t")
glu.heroin <- read.delim("diffbind.MACS2.consensus.glu.heroin.bed", header=F, sep="\t")
glu.heroin.shuffle <- read.delim("diffbind.MACS2.consensus.glu.heroin.shuffle.bed", header=F, sep="\t")

# Row-number IDs, one distinct ID column name per table.
glu.control$glu.control.ID <- seq.int(nrow(glu.control))
glu.control.shuffle$glu.control.shuffle.ID <- seq.int(nrow(glu.control.shuffle))
glu.heroin$glu.heroin.ID <- seq.int(nrow(glu.heroin))
glu.heroin.shuffle$glu.heroin.shuffle.ID <- seq.int(nrow(glu.heroin.shuffle))

# Real peaks: drop every peak whose ID appears in the control/heroin overlap.
glu.control.heroin.intersect <- genome_intersect(glu.control, glu.heroin, by=c("V1", "V2", "V3"))
glu.control.differential <- glu.control[
  !(glu.control$glu.control.ID %in% glu.control.heroin.intersect$glu.control.ID), ]
glu.heroin.differential <- glu.heroin[
  !(glu.heroin$glu.heroin.ID %in% glu.control.heroin.intersect$glu.heroin.ID), ]
nrow(glu.control.differential)
# 10848 / 83915 = 0.1292737
nrow(glu.heroin.differential)
# 5706 / 78655 = 0.07254466

# Shuffled peaks: same rule.
# NOTE(review): the recorded shuffle counts are close to the difference in peak
# numbers (83915 - 78655 = 5260), which suggests the control and heroin shuffles
# were not independent -- confirm before using them as a null.
glu.control.heroin.shuffle.intersect <- genome_intersect(glu.control.shuffle, glu.heroin.shuffle, by=c("V1", "V2", "V3"))
glu.control.shuffle.differential <- glu.control.shuffle[
  !(glu.control.shuffle$glu.control.shuffle.ID %in% glu.control.heroin.shuffle.intersect$glu.control.shuffle.ID), ]
glu.heroin.shuffle.differential <- glu.heroin.shuffle[
  !(glu.heroin.shuffle$glu.heroin.shuffle.ID %in% glu.control.heroin.shuffle.intersect$glu.heroin.shuffle.ID), ]
nrow(glu.control.shuffle.differential)
# 5363 / 83915 = 0.06390991
nrow(glu.heroin.shuffle.differential)
# 214 / 78655 = 0.002720742

# Stack control-only and heroin-only peaks (first four BED columns).
glu.control.heroin.differential <- rbind(glu.control.differential[, 1:4], glu.heroin.differential[, 1:4])
glu.control.heroin.shuffle.differential <- rbind(glu.control.shuffle.differential[, 1:4], glu.heroin.shuffle.differential[, 1:4])
# olig
# Presence/absence "differential" peaks for OLIG: a peak is differential when it
# has no overlapping peak in the other condition. Done for the real consensus
# peaks and for their shuffled counterparts.
olig.control <- read.delim("diffbind.MACS2.consensus.olig.control.bed", header=F, sep="\t")
olig.control.shuffle <- read.delim("diffbind.MACS2.consensus.olig.control.shuffle.bed", header=F, sep="\t")
olig.heroin <- read.delim("diffbind.MACS2.consensus.olig.heroin.bed", header=F, sep="\t")
olig.heroin.shuffle <- read.delim("diffbind.MACS2.consensus.olig.heroin.shuffle.bed", header=F, sep="\t")

# Row-number IDs, one distinct ID column name per table.
olig.control$olig.control.ID <- seq.int(nrow(olig.control))
olig.control.shuffle$olig.control.shuffle.ID <- seq.int(nrow(olig.control.shuffle))
olig.heroin$olig.heroin.ID <- seq.int(nrow(olig.heroin))
olig.heroin.shuffle$olig.heroin.shuffle.ID <- seq.int(nrow(olig.heroin.shuffle))

# Real peaks: drop every peak whose ID appears in the control/heroin overlap.
olig.control.heroin.intersect <- genome_intersect(olig.control, olig.heroin, by=c("V1", "V2", "V3"))
olig.control.differential <- olig.control[
  !(olig.control$olig.control.ID %in% olig.control.heroin.intersect$olig.control.ID), ]
olig.heroin.differential <- olig.heroin[
  !(olig.heroin$olig.heroin.ID %in% olig.control.heroin.intersect$olig.heroin.ID), ]
nrow(olig.control.differential)
# 1164 / 38060 = 0.03058329
nrow(olig.heroin.differential)
# 8987 / 46054 = 0.1951405

# Shuffled peaks: same rule.
# NOTE(review): the recorded shuffle counts are close to the difference in peak
# numbers (46054 - 38060 = 7994), which suggests the control and heroin shuffles
# were not independent -- confirm before using them as a null.
olig.control.heroin.shuffle.intersect <- genome_intersect(olig.control.shuffle, olig.heroin.shuffle, by=c("V1", "V2", "V3"))
olig.control.shuffle.differential <- olig.control.shuffle[
  !(olig.control.shuffle$olig.control.shuffle.ID %in% olig.control.heroin.shuffle.intersect$olig.control.shuffle.ID), ]
olig.heroin.shuffle.differential <- olig.heroin.shuffle[
  !(olig.heroin.shuffle$olig.heroin.shuffle.ID %in% olig.control.heroin.shuffle.intersect$olig.heroin.shuffle.ID), ]
nrow(olig.control.shuffle.differential)
# 50 / 38060 = 0.001313715
nrow(olig.heroin.shuffle.differential)
# 8003 / 46054 = 0.1737743

# Stack control-only and heroin-only peaks (first four BED columns).
olig.control.heroin.differential <- rbind(olig.control.differential[, 1:4], olig.heroin.differential[, 1:4])
olig.control.heroin.shuffle.differential <- rbind(olig.control.shuffle.differential[, 1:4], olig.heroin.shuffle.differential[, 1:4])
### near genes?
# Annotate every differential peak (real and shuffled) with its closest gene.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header=F, sep="\t")
# Keep GTF columns 1, 4, 5, 7, 9 and rename the first three to V1/V2/V3 so they
# match the coordinate columns of the peak tables.
gene.df <- gene[, c(1, 4, 5, 7, 9)]
colnames(gene.df) <- c("V1", "V2", "V3", "strand", "ID")

# Left join of a peak table to the closest gene; the gap is stored in `distance`.
closest_gene <- function(peaks) {
  genome_join_closest(peaks, gene.df, by=c("V1", "V2", "V3"), distance_column_name="distance", mode="left")
}

gaba.differential.gene <- closest_gene(gaba.control.heroin.differential)
gaba.shuffle.differential.gene <- closest_gene(gaba.control.heroin.shuffle.differential)
glu.differential.gene <- closest_gene(glu.control.heroin.differential)
glu.shuffle.differential.gene <- closest_gene(glu.control.heroin.shuffle.differential)
olig.differential.gene <- closest_gene(olig.control.heroin.differential)
olig.shuffle.differential.gene <- closest_gene(olig.control.heroin.shuffle.differential)
### DE genes
# How many differentially expressed genes are the closest gene to at least one
# differential peak? Counted for real and shuffled peaks in each cell type.
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/rnaseq/")
gaba.int <- read.delim("GABA.DEG.txt", header=T, sep="\t")
glu.int <- read.delim("GLU.DEG.txt", header=T, sep="\t")
olig.int <- read.delim("OLIG.DEG.txt", header=T, sep="\t")

# Keep only the first ";"-separated attribute of the ID column (stored as `gene`).
gtf_first_attribute <- function(peaks) separate(peaks, ID, c("gene"), sep=";")
# Split that attribute on the space into `gene` and `Gene_Symbol`.
gtf_gene_symbol <- function(peaks) separate(peaks, gene, c("gene", "Gene_Symbol"), sep=" ")
# Rows whose Gene_Symbol appears in the DEG table.
keep_deg_symbols <- function(peaks, deg) peaks[peaks$Gene_Symbol %in% deg$Gene_Symbol, ]

# gaba, real peaks
gaba.differential.gene.id <- gtf_first_attribute(gaba.differential.gene)
gaba.differential.gene.id2 <- gtf_gene_symbol(gaba.differential.gene.id)
gaba.differential.gene.DE <- keep_deg_symbols(gaba.differential.gene.id2, gaba.int)
length(unique(gaba.differential.gene.DE$Gene_Symbol))
# 69
# gaba, shuffled peaks
gaba.shuffle.differential.gene.id <- gtf_first_attribute(gaba.shuffle.differential.gene)
gaba.shuffle.differential.gene.id2 <- gtf_gene_symbol(gaba.shuffle.differential.gene.id)
gaba.shuffle.differential.gene.DE <- keep_deg_symbols(gaba.shuffle.differential.gene.id2, gaba.int)
length(unique(gaba.shuffle.differential.gene.DE$Gene_Symbol))
# 22

# glu, real peaks
glu.differential.gene.id <- gtf_first_attribute(glu.differential.gene)
glu.differential.gene.id2 <- gtf_gene_symbol(glu.differential.gene.id)
glu.differential.gene.DE <- keep_deg_symbols(glu.differential.gene.id2, glu.int)
length(unique(glu.differential.gene.DE$Gene_Symbol))
# 56
# glu, shuffled peaks
glu.shuffle.differential.gene.id <- gtf_first_attribute(glu.shuffle.differential.gene)
glu.shuffle.differential.gene.id2 <- gtf_gene_symbol(glu.shuffle.differential.gene.id)
glu.shuffle.differential.gene.DE <- keep_deg_symbols(glu.shuffle.differential.gene.id2, glu.int)
length(unique(glu.shuffle.differential.gene.DE$Gene_Symbol))
# 25

# olig, real peaks
olig.differential.gene.id <- gtf_first_attribute(olig.differential.gene)
olig.differential.gene.id2 <- gtf_gene_symbol(olig.differential.gene.id)
olig.differential.gene.DE <- keep_deg_symbols(olig.differential.gene.id2, olig.int)
length(unique(olig.differential.gene.DE$Gene_Symbol))
# 41
# olig, shuffled peaks
olig.shuffle.differential.gene.id <- gtf_first_attribute(olig.shuffle.differential.gene)
olig.shuffle.differential.gene.id2 <- gtf_gene_symbol(olig.shuffle.differential.gene.id)
olig.shuffle.differential.gene.DE <- keep_deg_symbols(olig.shuffle.differential.gene.id2, olig.int)
length(unique(olig.shuffle.differential.gene.DE$Gene_Symbol))
# 15
### plot distribution of distance to genes
### closest
# Tag every peak-to-closest-gene table with its cell type and whether it comes
# from real ATAC peaks or shuffled peaks, stack them, and plot the density of
# the distance to the closest gene.
olig.differential.gene$cell.type <- "OLIG"
olig.differential.gene$peaks <- "ATAC"
olig.shuffle.differential.gene$cell.type <- "OLIG"
olig.shuffle.differential.gene$peaks <- "Shuffle"
gaba.differential.gene$cell.type <- "GABA"
gaba.differential.gene$peaks <- "ATAC"
gaba.shuffle.differential.gene$cell.type <- "GABA"
gaba.shuffle.differential.gene$peaks <- "Shuffle"
glu.differential.gene$cell.type <- "GLU"
glu.differential.gene$peaks <- "ATAC"
glu.shuffle.differential.gene$cell.type <- "GLU"
glu.shuffle.differential.gene$peaks <- "Shuffle"
differential.gene <- rbind(olig.differential.gene, gaba.differential.gene, glu.differential.gene)
shuffle.differential.gene <- rbind(olig.shuffle.differential.gene, gaba.shuffle.differential.gene, glu.shuffle.differential.gene)
all.differential.gene <- rbind(differential.gene, shuffle.differential.gene)
library(ggplot2)
# One panel per (peak set, cell type) combination.
pdf("atac.shuffle.differential.closestGene.distribution.pdf")
# Fixed: removed a stray ")" after aes(...) that made this line a syntax error.
ggplot(all.differential.gene, aes(x=distance, color=peaks)) + geom_density() + theme_classic() + facet_grid(peaks ~ cell.type)
dev.off()
### DE
# Same tagging and stacking as for all closest genes, restricted to peaks whose
# closest gene is differentially expressed; then a density plot and a histogram
# of the distance, one row of panels per cell type.

# real ATAC peaks
olig.differential.gene.DE$cell.type <- "OLIG"
olig.differential.gene.DE$peaks <- "ATAC"
gaba.differential.gene.DE$cell.type <- "GABA"
gaba.differential.gene.DE$peaks <- "ATAC"
glu.differential.gene.DE$cell.type <- "GLU"
glu.differential.gene.DE$peaks <- "ATAC"

# shuffled peaks
olig.shuffle.differential.gene.DE$cell.type <- "OLIG"
olig.shuffle.differential.gene.DE$peaks <- "Shuffle"
gaba.shuffle.differential.gene.DE$cell.type <- "GABA"
gaba.shuffle.differential.gene.DE$peaks <- "Shuffle"
glu.shuffle.differential.gene.DE$cell.type <- "GLU"
glu.shuffle.differential.gene.DE$peaks <- "Shuffle"

differential.DEgene <- rbind(olig.differential.gene.DE, gaba.differential.gene.DE, glu.differential.gene.DE)
shuffle.differential.DEgene <- rbind(olig.shuffle.differential.gene.DE, gaba.shuffle.differential.gene.DE, glu.shuffle.differential.gene.DE)
all.differential.DEgene <- rbind(differential.DEgene, shuffle.differential.DEgene)

pdf("atac.shuffle.differential.closestGene.DE.distribution.pdf")
ggplot(all.differential.DEgene, aes(x = distance, color = peaks)) +
  geom_density() +
  theme_classic() +
  facet_grid(cell.type ~ .)
dev.off()

pdf("atac.shuffle.differential.closestGene.DE.histogram.pdf")
ggplot(all.differential.DEgene, aes(x = distance, color = peaks)) +
  geom_histogram() +
  theme_classic() +
  facet_grid(cell.type ~ .)
dev.off()
# --> Overlap of the shuffled ATAC peaks with ChIP-seq peaks
library(tidygenomics)
library(dplyr)
library(tidyr)
## All peaks
# Merged ChIP-seq peak sets, one per cell type.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Chipseq/")
olig.chip <- read.delim("olig.merged.peaks.bed", header=F, sep="\t")
glu.chip <- read.delim("glu.merged.peaks.bed", header=F, sep="\t")
gaba.chip <- read.delim("gaba.merged.peaks.bed", header=F, sep="\t")
#mkdir /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/Shuffle
#cd /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/Shuffle
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.gaba.control.shuffle.bed .
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.glu.control.shuffle.bed .
#scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.olig.control.shuffle.bed .
# Shuffled control consensus peaks. Despite the *.atac.control names, these
# objects hold the SHUFFLED peak sets.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/Shuffle")
glu.atac.control <- read.delim("diffbind.MACS2.consensus.glu.control.shuffle.bed", header=F, sep="\t")
# 83915
gaba.atac.control <- read.delim("diffbind.MACS2.consensus.gaba.control.shuffle.bed", header=F, sep="\t")
# 51337
olig.atac.control <- read.delim("diffbind.MACS2.consensus.olig.control.shuffle.bed", header=F, sep="\t")
# 38060

# For each cell type: intersect shuffled peaks with ChIP peaks and count the
# distinct shuffled peaks (V4 of the first table, suffixed .x by the join) that
# overlap at least one ChIP peak. The fraction for the real ATAC peaks is quoted
# in parentheses for comparison.
gaba.atac.control.chip <- genome_intersect(gaba.atac.control, gaba.chip, by=c("V1", "V2", "V3"))
n_distinct(gaba.atac.control.chip$V4.x)
# 4479 / 51337 = 0.08724701 shuffled peaks (0.8395894 real atac peaks)
glu.atac.control.chip <- genome_intersect(glu.atac.control, glu.chip, by=c("V1", "V2", "V3"))
n_distinct(glu.atac.control.chip$V4.x)
# 8231 / 83915 = 0.09808735 shuffled peaks (0.8203778 real atac peaks)
olig.atac.control.chip <- genome_intersect(olig.atac.control, olig.chip, by=c("V1", "V2", "V3"))
n_distinct(olig.atac.control.chip$V4.x)
# 2863 / 38060 = 0.07522333 shuffled peaks (0.8080925 real atac peaks)
#!/bin/bash
#SBATCH --account=SYB105
#SBATCH --job-name=csaw
#SBATCH --nodes=2
#SBATCH --time=24:00:00
#SBATCH --mem-per-cpu=0
# Batch job: run csaw.R from the project directory inside the "csaw" conda env.
# The env is activated by name, so it must already exist.

# Make conda available, then activate the csaw environment.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate csaw

# csaw.R is resolved relative to the project directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
R CMD BATCH csaw.R

# Submit with:
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/csaw.sh
# One-time setup of the "csaw" conda environment used by csaw.sh / csaw.R.
# Run once, before submitting the batch job that does `conda activate csaw`.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# Create the named environment with Python 3.8 and switch into it.
conda create --name csaw python=3.8
conda activate csaw
# Install the Bioconductor csaw package from the conda-forge and bioconda channels.
conda install -c conda-forge -c bioconda bioconductor-csaw
# csaw.R: estimate per-library fragment lengths, count reads in small sliding
# windows across all libraries, and merge nearby windows into regions.
library(csaw)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
options(stringsAsFactors=FALSE)

# Sample sheet; the bamReads column holds the BAM path of each library.
fsample = "key.files/diffbind.macs2.outlier.sample.rotsscripts.csv"
samples = read.csv(fsample, header=TRUE)
files = samples$bamReads
pe.bam = samples$bamReads

# Mean paired-end fragment length per library, plus a histogram of the fragment
# sizes (<= 2000 bp) for each library with a red line at 1500 bp.
# The result vector is preallocated and the loop uses seq_along(), so an empty
# sample sheet yields an empty result instead of iterating over c(1, 0).
frag.lens = numeric(length(pe.bam))
# Open the plot device explicitly so the dev.off() below always has a matching
# device. "Rplots.pdf" is the file R writes to by default when plotting in a
# non-interactive session such as R CMD BATCH, so the batch output file name
# does not change.
pdf("Rplots.pdf")
for (i in seq_along(pe.bam)) {
  out = getPESizes(pe.bam[i])
  frag.lens[i] = mean(out$sizes)
  frag.sizes <- out$sizes[out$sizes <= 2000]
  hist(frag.sizes, breaks=50, xlab="Fragment sizes (bp)", ylab="Frequency", main="", col="grey80")
  abline(v=1500, col="red")
}
dev.off()

# Cache the fragment lengths; the load() lets a later run start from here.
save(frag.lens, file="frag.lens.Rdata")
load("frag.lens.Rdata")

# Read-extraction settings: paired-end, fragments up to 1500 bp, MAPQ >= 20.
pe.param = readParam(max.frag=1500, pe="both", minq=20)
# fragment length
multi.frag.len = list(frag.lens, NA)
# binding site length
win.width = 10
#spacing should not be larger than ext/2 for analyses with small windows. If ext is also very small, spacing should be set to width to avoid loading too many small windows.
data = windowCounts(pe.bam, ext=multi.frag.len, filter=700, spacing=win.width, param=pe.param, width=win.width)

# Merge windows that lie within 100 bp of each other and save the regions.
merged <- mergeWindows(rowRanges(data), tol=100L)
my.regions = merged$region
save(my.regions, file="regions.Rdata")
#!/bin/bash
# Slurm batch job: merge the per-sample BAMs of each cell type with bamtools,
# then coordinate-sort and index the merged GABA BAM with samtools.
#SBATCH --account=SYB105
#SBATCH --job-name=bam.merge
#SBATCH --nodes=1
#SBATCH --partition=gpu
#SBATCH --time=48:00:00
#SBATCH --mem-per-cpu=0

source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test

# merge all bam files to run macs2 for ABC?
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam
# ls *.bam > files.bamlist
# grep 'GLU' files.bamlist > files.glu.bamlist
# grep 'GABA' files.bamlist > files.gaba.bamlist
# grep 'OLIG' files.bamlist > files.olig.bamlist

# One merged BAM per cell type, driven by the matching files.<celltype>.bamlist.
for celltype in gaba glu olig; do
    bamtools merge -list "files.${celltype}.bamlist" -out "rmdups.uniq.${celltype}.merged.bam"
done
# bamtools merge -list files.bamlist -out rmdups.uniq.merged.bam

# Only the GABA merge is sorted and indexed here.
samtools_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/samtools/samtools
"${samtools_bin}" sort -l 1 -m 7G -o rmdups.uniq.gaba.merged.sorted.bam -O bam -@ 32 rmdups.uniq.gaba.merged.bam
"${samtools_bin}" index -@ 32 rmdups.uniq.gaba.merged.sorted.bam
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/bam.merge.sh
#!/bin/bash
# Slurm batch job: call MACS2 peaks (fixed 147 bp extension, no model building)
# on the merged GABA BAM.
#SBATCH --account=SYB105
#SBATCH --job-name=macs2
#SBATCH --nodes=1
#SBATCH --partition=gpu
#SBATCH --time=48:00:00
#SBATCH --mem-per-cpu=0

source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test

# call peaks
bam_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam
peak_dir=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged
cd "${peak_dir}"
macs2 callpeak \
    --treatment "${bam_dir}/rmdups.uniq.gaba.merged.bam" \
    --name gaba.rmdups.uniq.merged.macs2 \
    --format BAM \
    --gsize hs \
    --pvalue .1 \
    --nomodel \
    --extsize 147 \
    --call-summits \
    --outdir "${peak_dir}"
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/macs2.sh
#!/bin/bash
# Slurm batch job for the ABC (Activity-by-Contact) enhancer-gene pipeline on
# the merged GABA data. This first part sets up the environment and runs
# Step 1 (candidate regions).
#SBATCH -A SYB105
# Job name was "bam.merge" (copied from the merge script); this script is
# submitted as abc.sh, so name the job accordingly in squeue/sacct.
#SBATCH -J abc
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH -t 48:00:00
#SBATCH --mem-per-cpu=0
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/ABC
# The ABC scripts are invoked as src/<script>.py, so run from the repository root.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ABC-Enhancer-Gene-Prediction
# Step 1: call candidate regions
# Inputs: sorted MACS2 narrowPeak file and the sorted merged GABA BAM.
# Peaks are extended 250 bp from the summit, the 150000 strongest are kept,
# blocklisted regions are excluded and the TSS +/- 250 bp regions are included.
python src/makeCandidateRegions.py \
--narrowPeak /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted \
--bam /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.sorted.bam \
--outDir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/ \
--chrom_sizes /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size \
--regions_blocklist /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/hg38-blacklist.v2.ensembl.bed \
--regions_includelist /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.TSS500bp.bed \
--peakExtendFromSummit 250 \
--nStrongestPeaks 150000
# cp /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size.bed
# cp /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/hg38-blacklist.v2.bed /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/hg38-blacklist.v2.ensembl.bed
# bedtools sort -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted.rmdups.uniq.gaba.merged.bam.Counts.bed -faidx /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size | bedtools merge -i stdin -c 4 -o max | sort -nr -k 4 | head -n 150000 |bedtools intersect -b stdin -a /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted -wa |awk '{print $1 "\t" $2 + $10 "\t" $2 + $10}' |bedtools slop -i stdin -b 250 -g /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size |bedtools sort -i stdin -faidx /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size |bedtools merge -i stdin | bedtools intersect -v -wa -a stdin -b /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/hg38-blacklist.v2.ensembl.bed | cut -f 1-3 | (bedtools intersect -a /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.TSS500bp.bed -b /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size.bed -wa | cut -f 1-3 && cat) |bedtools sort -i stdin -faidx /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size | bedtools merge -i stdin > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted.candidateRegions.bed
# Step 2: quantifying enhancer activity
# awk '{print $1"\t"0"\t"$2}' /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size.bed

# Shared locations for Steps 2-4.
proj=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
ref_dir="${proj}/ref"
merged_dir="${proj}/bwa.output/macs.output/rmdups.uniq.merged"
abc_out="${proj}/bwa.output/macs.output/test"
chrom_sizes="${ref_dir}/GRCh38.p13.size"

python src/run.neighborhoods.py \
    --candidate_enhancer_regions "${merged_dir}/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted.candidateRegions.bed" \
    --genes "${ref_dir}/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.collabsed.bed" \
    --H3K27ac /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/H3K27Ac_ChIPseq/GABA/Mt_Sinai_BAM/mrege.SOX.sorted.bam \
    --ATAC "${proj}/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.sorted.bam" \
    --expression_table "${merged_dir}/Gaba_ABC_meanTPM.txt" \
    --chrom_sizes "${chrom_sizes}" \
    --ubiquitously_expressed_genes "${merged_dir}/All_ABC_constitutive.txt" \
    --cellType gaba \
    --outdir "${abc_out}"
# Assigning classes to enhancers
# Total enhancers: 185578
# Promoters: 22922
# Genic: 113221
# Intergenic: 49435

# Step 3: computing ABC score
# If experimentally derived contact data is not available, one can run the ABC model using the powerlaw estimate only. In this case the --HiCdir argument should be excluded from predict.py and the --score_column powerlaw.Score argument should be included in predict.py. In this case the ABC.Score column of the predictions file will be set to NaN. The powerlaw.Score column of the output prediction files will be the relevant Score column to use.
# Reads the EnhancerList.txt / GeneList.txt files from the Step 2 output directory.
python src/predict.py \
    --enhancers "${abc_out}/EnhancerList.txt" \
    --genes "${abc_out}/GeneList.txt" \
    --chrom_sizes "${chrom_sizes}" \
    --score_column powerlaw.Score \
    --hic_resolution 5000 \
    --scale_hic_using_powerlaw \
    --threshold .02 \
    --cellType gaba \
    --outdir "${abc_out}/" \
    --make_all_putative

# Step 4: get prediction files for variant overlap
python src/getVariantOverlap.py \
    --all_putative "${abc_out}/EnhancerPredictionsAllPutative.txt.gz" \
    --chrom_sizes "${chrom_sizes}" \
    --outdir "${abc_out}/"
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/abc.sh
# Interactive session: fetch the ABC code, call MACS2 peaks on the merged GABA
# BAM, and sort the resulting narrowPeak file.
proj=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
cd "${proj}/"
git clone https://github.com/broadinstitute/ABC-Enhancer-Gene-Prediction.git
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
## pre-run macs peak files: /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/peaks
# call peaks
# conda env create --name abc.macs -f macs.yml
gaba_bam="${proj}/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.bam"
peak_dir="${proj}/bwa.output/macs.output/rmdups.uniq.merged"
cd "${peak_dir}"

# First attempt: let MACS2 build its own model (it could not -- see the warnings below).
macs2 callpeak \
    --treatment "${gaba_bam}" \
    --name gaba.rmdups.uniq.merged.macs2 \
    --format BAM \
    --gsize hs \
    --pvalue .1 \
    --call-summits \
    --outdir "${peak_dir}"
# WARNING @ Wed, 08 Dec 2021 13:27:47: Too few paired peaks (33) so I can not build the model! Broader your MFOLD range parameter may erase this error. If it still can't build the model, we suggest to use --nomodel and --extsize 147 or other fixed number instead.
# WARNING @ Wed, 08 Dec 2021 13:27:47: Process for pairing-model is terminated!

# Second attempt: skip model building and use the fixed extension the warning suggests.
macs2 callpeak \
    --treatment "${gaba_bam}" \
    --name gaba.rmdups.uniq.merged.macs2 \
    --format BAM \
    --gsize hs \
    --pvalue .1 \
    --nomodel \
    --extsize 147 \
    --call-summits \
    --outdir "${peak_dir}"

# sort narrow peaks file
narrow_peak="${peak_dir}/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak"
bedtools sort -i "${narrow_peak}" -faidx "${proj}/ref/GRCh38.p13.size.bed" > "${narrow_peak}.sorted"
# Step 1: call candidate regions
# make TSS bed file
# Builds three reference files from the gene-level GTF: a TSS +/- 250 bp BED,
# a gene-body BED, and a one-column gene list.
R
library(dplyr)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/")
df <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.gtf", header=F, sep="\t")

# TSS is the feature start (V4) on the "+" strand and the feature end (V5)
# otherwise. The window start is clamped at 0: a gene within 250 bp of the
# beginning of a sequence would otherwise get a negative start, which is not a
# valid BED coordinate.
tss <- ifelse(df$V7 == "+", df$V4, df$V5)
df.tss <- df %>% mutate(chr = V1, start = pmax(tss - 250, 0), end = tss + 250)
# Columns 10:12 are the chr/start/end just added; 9 is the attribute string, 7 the strand.
df.tss.sub <- df.tss[,c(10:12,9,7)]
# Keep only the first ";"-separated attribute, then split it on the space into
# its key ("type") and value ("gid"). extra="drop" discards the remaining
# attributes exactly as before, without a warning for every row.
df.id <- separate(df.tss.sub, V9, c("gene_id"), sep=";", extra="drop")
df.id2 <- separate(df.id, gene_id, c("type", "gid"), sep=" ")
# chr, start, end, gid, strand
df.print <- df.id2[,c(1:3,5,6)]
write.table(df.print, "GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.TSS500bp.bed", quote=F, row.names=F, col.names=F, sep="\t")

# Gene-body BED: V1, V4, V5, gid, a constant 0 column, strand.
df.id <- separate(df, V9, c("gene_id"), sep=";", extra="drop")
df.id2 <- separate(df.id, gene_id, c("type", "gid"), sep=" ")
df.id2$zero <- 0
df.print <- df.id2[,c(1,4,5,10,11,7)]
write.table(df.print, "GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.collabsed.bed", quote=F, row.names=F, col.names=F, sep="\t")

# One gene identifier per line.
df.genelist <- data.frame(df.id2[,10])
write.table(df.genelist, "GCF_000001405.39_GRCh38.p13_genomic.genelist.bed", quote=F, row.names=F, col.names=F, sep="\t")
# Interactive run of ABC Step 1: activate the ABC environment, index the merged
# GABA BAM, and build the candidate regions.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# conda env create -f /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ABC-Enhancer-Gene-Prediction/abcenv.yml
# conda activate final-abc-env
# conda env list
# conda create --name ABC python=3.6.4
conda activate ABC
# conda install -c conda-forge -c bioconda samtools bedtools Tabix MACS2 Java Juicer
# conda install -c conda-forge -c bioconda pyranges numpy pandas scipy pysam pyBigWig

proj=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
bam_dir="${proj}/bwa.output/rmdups.uniq.bam"
peak_dir="${proj}/bwa.output/macs.output/rmdups.uniq.merged"
cd "${proj}/ABC-Enhancer-Gene-Prediction"

# The index is written next to the BAM as rmdups.uniq.gaba.merged.bai.
samtools index "${bam_dir}/rmdups.uniq.gaba.merged.bam" "${bam_dir}/rmdups.uniq.gaba.merged.bai"

python src/makeCandidateRegions.py \
    --narrowPeak "${peak_dir}/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted" \
    --bam "${bam_dir}/rmdups.uniq.gaba.merged.bam" \
    --outDir "${peak_dir}/" \
    --chrom_sizes "${proj}/ref/GRCh38.p13.size" \
    --regions_blocklist "${proj}/bwa.output/macs.output/hg38-blacklist.v2.ensembl.bed" \
    --regions_includelist "${proj}/ref/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.TSS500bp.bed" \
    --peakExtendFromSummit 250 \
    --nStrongestPeaks 150000
# Step 2: quantifying enhancer activity
## TPM files here: /gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only
# Builds the ABC expression inputs: a mean-TPM table per cell type and lists of
# genes with TPM >= 1 in both conditions (per cell type and shared by all three).
R
library(dplyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only")

# Average-TPM tables with Control and Heroin columns.
df.gaba <- read.csv("Gabba_heroin_vs_control_averageTPM.csv", stringsAsFactors = FALSE)
df.glu <- read.csv("Glu_heroin_vs_control_averageTPM.csv", stringsAsFactors = FALSE)
df.olig <- read.csv("Olig_heroin_vs_control_averageTPM.csv", stringsAsFactors = FALSE)

# Mean of the two conditions, appended as a new column.
df.gaba.mean <- mutate(df.gaba, mean = (Control + Heroin) / 2)
df.glu.mean <- mutate(df.glu, mean = (Control + Heroin) / 2)
df.olig.mean <- mutate(df.olig, mean = (Control + Heroin) / 2)

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/")
# Columns 1 and 4: the first column of the input and the appended mean.
write.table(df.gaba.mean[,c(1,4)], "Gaba_ABC_meanTPM.txt", quote=F, row.names=F, col.names=F, sep="\t")
write.table(df.glu.mean[,c(1,4)], "Glu_ABC_meanTPM.txt", quote=F, row.names=F, col.names=F, sep="\t")
write.table(df.olig.mean[,c(1,4)], "Olig_ABC_meanTPM.txt", quote=F, row.names=F, col.names=F, sep="\t")

# Genes at TPM >= 1 in both Control and Heroin.
df.gaba.const <- subset(df.gaba, Control >= 1 & Heroin >= 1)
df.glu.const <- subset(df.glu, Control >= 1 & Heroin >= 1)
df.olig.const <- subset(df.olig, Control >= 1 & Heroin >= 1)
write.table(data.frame(df.gaba.const[,1]), "Gaba_ABC_constitutive.txt", quote=F, row.names=F, col.names=F, sep="\t")
write.table(data.frame(df.glu.const[,1]), "Glu_ABC_constitutive.txt", quote=F, row.names=F, col.names=F, sep="\t")
write.table(data.frame(df.olig.const[,1]), "Olig_ABC_constitutive.txt", quote=F, row.names=F, col.names=F, sep="\t")

# Genes that pass the filter in all three cell types (joined on column X).
df.gaba.glu <- df.gaba.const %>% inner_join(df.glu.const, by="X")
df.gaba.glu.olig <- df.gaba.glu %>% inner_join(df.olig.const, by="X")
write.table(data.frame(df.gaba.glu.olig[,1]), "All_ABC_constitutive.txt", quote=F, row.names=F, col.names=F, sep="\t")
## merge chipseq bam files
cd /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/H3K27Ac_ChIPseq/GABA/Mt_Sinai_BAM/
# ls *.bam > files.gaba.bamlist
bamtools merge -list files.gaba.bamlist -out mrege.SOX.sorted.bam

# The ABC scripts are invoked as src/<script>.py, so go back to the repository
# root; the cd above left the shell in the ChIP-seq BAM directory.
proj=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac
cd "${proj}/ABC-Enhancer-Gene-Prediction"

ref_dir="${proj}/ref"
# Step 1 outputs live here, and Steps 2-4 read from and write to it as well.
abc_out="${proj}/bwa.output/macs.output/rmdups.uniq.merged/"
chrom_sizes="${ref_dir}/GRCh38.p13.size"

# Step 2 writes EnhancerList.txt and GeneList.txt into --outdir.
# The cell-type label is "gaba" because every input here is the GABA data set.
python src/run.neighborhoods.py \
    --candidate_enhancer_regions "${abc_out}gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted.candidateRegions.bed" \
    --genes "${ref_dir}/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.collabsed.bed" \
    --H3K27ac /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/H3K27Ac_ChIPseq/GABA/Mt_Sinai_BAM/mrege.SOX.sorted.bam \
    --DHS "${proj}/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.bam" \
    --expression_table "${abc_out}Gaba_ABC_meanTPM.txt" \
    --chrom_sizes "${chrom_sizes}" \
    --ubiquitously_expressed_genes "${abc_out}All_ABC_constitutive.txt" \
    --cellType gaba \
    --outdir "${abc_out}"

# Step 3: computing ABC score
# If experimentally derived contact data is not available, one can run the ABC model using the powerlaw estimate only. In this case the --HiCdir argument should be excluded from predict.py and the --score_column powerlaw.Score argument should be included in predict.py. In this case the ABC.Score column of the predictions file will be set to NaN. The powerlaw.Score column of the output prediction files will be the relevant Score column to use.
# Score the enhancer and gene lists produced by Step 2 above. The previous
# command pointed at the repository's chr22 example EnhancerList with the K562
# label and at a one-column gene list, so it never scored this data set.
python src/predict.py \
    --enhancers "${abc_out}EnhancerList.txt" \
    --genes "${abc_out}GeneList.txt" \
    --chrom_sizes "${chrom_sizes}" \
    --score_column powerlaw.Score \
    --hic_resolution 5000 \
    --scale_hic_using_powerlaw \
    --threshold .02 \
    --cellType gaba \
    --outdir "${abc_out}" \
    --make_all_putative

# Step 4: get prediction files for variant overlap
# Use the Step 3 output directory explicitly; the bare relative paths resolved
# against the repository root, where Step 3 does not write its predictions.
python src/getVariantOverlap.py \
    --all_putative "${abc_out}EnhancerPredictionsAllPutative.txt.gz" \
    --chrom_sizes "${chrom_sizes}" \
    --outdir "${abc_out}"
10 December 2021 http://bioconductor.org/packages/release/bioc/vignettes/variancePartition/inst/doc/variancePartition.pdf
# Interactive session: run the variancePartition vignette example on the
# package's bundled data to check that the installation works.
# salloc -A SYB105 -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("variancePartition")
library('variancePartition')
# Example data; the objects geneExpr and info used below come from this call.
data(varPartData)
# Age as a fixed effect; Individual, Tissue and Batch as random effects.
form <- ~ Age + (1|Individual) + (1|Tissue) + (1|Batch)
varPart <- fitExtractVarPartModel(geneExpr, form, info )
vp <- sortCols( varPart )
fig <- plotVarPart( vp )
# `file` was never assigned, so ggsave(file, fig) was handed base R's file()
# function instead of an output path. Give it a real file name.
file <- "varPartData.example.variancePartition.pdf"
ggsave(file, fig)
# Variance partitioning of the peak counts across all retained samples
# (AGE fixed; SEX, GROUP and Celltype as random effects).
# Set the seed so our results are reproducible:
set.seed(12345)
library(tidyr)
library('variancePartition')
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.summit.sample.outlier.csv")
metadata <- read.delim("ATAC.All.Metadata.txt")

# Split the Sample column on "_" into Sample.ID and extension, drop columns
# 2-3, and keep only the samples listed in the sample sheet.
meta <- metadata %>% separate(Sample, c("Sample.ID", "extension"), sep="_")
meta.df <- meta[, -c(2, 3)]
meta.sample <- meta.df[meta.df$Sample.ID %in% samples$Sample.ID, ]

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.narrowpeak.peakset.counts.txt")
# Columns 4-151 hold the per-sample counts; round them to whole numbers.
counts.df <- counts[,4:151]
counts.mat <- round(counts.df)
# NOTE(review): assumes the count columns are in the same order as meta.sample -- confirm.
colnames(counts.mat) <- meta.sample$Sample.ID

form <- ~ AGE + (1|SEX) + (1|GROUP) + (1|Celltype)
varPart <- fitExtractVarPartModel(counts.mat, form, meta.sample )
vp <- sortCols( varPart )
fig <- plotVarPart( vp )
ggsave("peaks.variancePartition.all.pdf", fig)
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac/peaks.variancePartition.all.pdf .
# try single cell type
# Split the counts and metadata by cell type, then fit the same model
# (AGE fixed; SEX and GROUP random) to GABA, GLU and OLIG in turn.
counts.mat.t <- t(counts.mat)
# Samples as rows: the first two metadata columns followed by one column per peak.
counts.celltype <- cbind(meta.sample[,1:2], counts.mat.t)

counts.gaba <- subset(counts.celltype, Celltype == "GABA")
counts.glu <- subset(counts.celltype, Celltype == "GLU")
counts.olig <- subset(counts.celltype, Celltype == "OLIG")

# Drop the two metadata columns and flip back to peaks x samples.
counts.gaba.mat <- t(counts.gaba[,-(1:2)])
counts.glu.mat <- t(counts.glu[,-(1:2)])
counts.olig.mat <- t(counts.olig[,-(1:2)])

meta.gaba <- subset(meta.sample, Celltype == "GABA")
meta.glu <- subset(meta.sample, Celltype == "GLU")
meta.olig <- subset(meta.sample, Celltype == "OLIG")

colnames(counts.gaba.mat) <- meta.gaba$Sample.ID
colnames(counts.glu.mat) <- meta.glu$Sample.ID
colnames(counts.olig.mat) <- meta.olig$Sample.ID

form <- ~ AGE + (1|SEX) + (1|GROUP)
celltype.runs <- list(
  list(counts=counts.gaba.mat, meta=meta.gaba, pdf="peaks.variancePartition.gaba.pdf"),
  list(counts=counts.glu.mat, meta=meta.glu, pdf="peaks.variancePartition.glu.pdf"),
  list(counts=counts.olig.mat, meta=meta.olig, pdf="peaks.variancePartition.olig.pdf")
)
# varPart, vp and fig are overwritten on every pass, so after the loop they
# hold the OLIG fit -- that is what summary(vp) below reports.
for (run in celltype.runs) {
  varPart <- fitExtractVarPartModel(run$counts, form, run$meta)
  vp <- sortCols(varPart)
  fig <- plotVarPart(vp)
  ggsave(run$pdf, fig)
}
summary(vp)
# GROUP AGE SEX Residuals
# Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.1376
# 1st Qu.:0.04365 1st Qu.:0.008131 1st Qu.:0.00000 1st Qu.:0.6503
# Median :0.19349 Median :0.021564 Median :0.00000 Median :0.7532
# Mean :0.19086 Mean :0.031599 Mean :0.02765 Mean :0.7499
# 3rd Qu.:0.29190 3rd Qu.:0.043630 3rd Qu.:0.00000 3rd Qu.:0.8698
# Max. :0.72119 Max. :0.511741 Max. :0.81666 Max. :1.0000
## Notes: For ATAC data the residual fraction is large. What confuses me (and it is the same thing I ran into when I estimated the proportion of variance with a different method) is that cell type shows very little variance explained, even though PCA and UMAP show that cell type is the major factor separating the samples. It does look like sex might be VERY important for a specific subset of peaks... look into those peaks and the genes they are associated with...
# Peaks with the largest SEX variance fraction in the current fit.
by.sex <- order(varPart$SEX, decreasing=TRUE)
head(varPart[by.sex,])
# GROUP SEX AGE Residuals
# 56954 2.412925e-02 0.8166598 0.006879660 0.1523313
# 62984 1.474341e-02 0.8108891 0.015078543 0.1592889
# 53454 0.000000e+00 0.8014735 0.006231425 0.1922951
# 41746 1.807537e-02 0.7979112 0.020406026 0.1636074
# 67806 7.008233e-11 0.7943158 0.003968134 0.2017161
# 17062 2.241073e-11 0.7888856 0.003086976 0.2080274
### Look at genes with sex contribution for gaba
# Refit on the GABA samples only.
form <- ~ AGE + (1|SEX) + (1|GROUP)
varPart <- fitExtractVarPartModel(counts.gaba.mat, form, meta.gaba )
vp <- sortCols( varPart )
nrow(vp)
# 145688
# Peaks where SEX accounts for more than half of the variance.
sex.dominant <- varPart$SEX > 0.50
gaba.sex <- subset(varPart, sex.dominant)
nrow(gaba.sex)
# 3548
by.sex <- order(gaba.sex$SEX, decreasing=TRUE)
head(gaba.sex[by.sex,])
# GROUP SEX AGE Residuals
# 44915 0.000000000 0.9723926 6.706626e-05 0.02754029
# 133259 0.000000000 0.9636905 5.131962e-03 0.03117756
# 70999 0.000000000 0.9431746 5.916773e-03 0.05090863
# 43051 0.005998034 0.9371322 3.997768e-03 0.05287204
# 143914 0.006464732 0.9363786 3.828483e-03 0.05332816
# 38485 0.000000000 0.9281969 1.950472e-03 0.06985265
# Peaks where SEX and GROUP each account for more than 40% of the variance.
sex.and.group <- varPart$SEX > 0.40 & varPart$GROUP > 0.40
gaba.group.sex <- subset(varPart, sex.and.group)
nrow(gaba.group.sex)
# 2
# GROUP SEX AGE Residuals
# 101529 0.4265484 0.4130875 0.005726043 0.1546381
# 125948 0.4145981 0.4019596 0.011713842 0.1717284
** NEED TO ** - generate a file for each cell type (including chr, start, end for each peak), run variancePartition, identify peaks with high sex/group variance explained, run bedtools closest against the gene GFF file, and identify the genes that the sex-driven peaks are associated with.
# Rerun of the GABA variance partitioning with every peak labelled by its
# coordinates (CHR:START:END), so high-variance peaks can be located.
# salloc -A SYB105 -N 2 -t 4:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# Set the seed so our results are reproducible:
set.seed(12345)
library(tidyr)
library('variancePartition')
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.hmmratac.summit.sample.outlier.csv")
metadata <- read.delim("ATAC.All.Metadata.txt")
# Split Sample on "_", drop columns 2-3, keep only samples in the sample sheet.
meta <- metadata %>% separate(Sample, c("Sample.ID", "extension"), sep="_")
meta.df <- meta[, -c(2, 3)]
meta.sample <- meta.df[meta.df$Sample.ID %in% samples$Sample.ID, ]

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/hmmratac")
counts <- read.delim("all.outliers.narrowpeak.peakset.counts.txt")
# Collapse CHR/START/END into one ":"-separated peak.id and use it as row names.
counts.id <- unite(counts, "peak.id", c(CHR, START, END), sep=":")
counts.df <- counts.id[,-1]
rownames(counts.df) <- counts.id$peak.id
counts.mat <- round(counts.df)
colnames(counts.mat) <- meta.sample$Sample.ID

# GABA
counts.mat.t <- t(counts.mat)
counts.celltype <- cbind(meta.sample[,1:2], counts.mat.t)
counts.gaba <- subset(counts.celltype, Celltype == "GABA")
# Drop the two metadata columns and flip back to peaks x samples.
counts.gaba.mat <- t(counts.gaba[,-(1:2)])
meta.gaba <- subset(meta.sample, Celltype == "GABA")
colnames(counts.gaba.mat) <- meta.gaba$Sample.ID

form <- ~ AGE + (1|SEX) + (1|GROUP)
varPart <- fitExtractVarPartModel(counts.gaba.mat, form, meta.gaba )
vp <- sortCols( varPart )
nrow(vp)
# 145688
sex.dominant <- varPart$SEX > 0.50
gaba.sex <- subset(varPart, sex.dominant)
nrow(gaba.sex)
# 3548
by.sex <- order(gaba.sex$SEX, decreasing=TRUE)
head(gaba.sex[by.sex,])
# GROUP SEX AGE Residuals
# NC_000005.10_103886080_103886580 0.000000000 0.9723926 6.706626e-05 0.02754029
# NC_000020.11_11001645_11002145 0.000000000 0.9636905 5.131962e-03 0.03117756
# NC_000009.12_12883264_12883764 0.000000000 0.9431746 5.916773e-03 0.05090863
# NC_000005.10_58200393_58200893 0.005998034 0.9371322 3.997768e-03 0.05287204
# NC_000023.11_80258164_80258664 0.006464732 0.9363786 3.828483e-03 0.05332816
# NC_000004.12_109065836_109066336 0.000000000 0.9281969 1.950472e-03 0.06985265
sex.and.group <- varPart$SEX > 0.40 & varPart$GROUP > 0.40
gaba.group.sex <- subset(varPart, sex.and.group)
nrow(gaba.group.sex)
# 2
by.sex <- order(gaba.group.sex$SEX, decreasing=TRUE)
head(gaba.group.sex[by.sex,])
# GROUP SEX AGE Residuals
# NC_000013.11_66482806_66483306 0.4265484 0.4130875 0.005726043 0.1546381
# NC_000018.10_52407619_52408119 0.4145981 0.4019596 0.011713842 0.1717284
summary(vp)
# GROUP AGE SEX Residuals
# Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.02754
# 1st Qu.:0.05221 1st Qu.:0.004563 1st Qu.:0.00000 1st Qu.:0.56829
# Median :0.23785 Median :0.017559 Median :0.00000 Median :0.69248
# Mean :0.22692 Mean :0.033852 Mean :0.04546 Mean :0.69376
# 3rd Qu.:0.35456 3rd Qu.:0.043401 3rd Qu.:0.00000 3rd Qu.:0.83656
# Max. :0.83113 Max. :0.434295 Max. :0.97239 Max. :1.00000
# Try looser thresholds; the last assignment is the one that remains in gaba.group.sex.
sex.and.group <- varPart$SEX > 0.10 & varPart$GROUP > 0.10
gaba.group.sex <- subset(varPart, sex.and.group)
nrow(gaba.group.sex)
# 7507
sex.and.group <- varPart$SEX > 0.20 & varPart$GROUP > 0.40
gaba.group.sex <- subset(varPart, sex.and.group)
nrow(gaba.group.sex)
# 276
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
# The peak identifiers exist only as row names of the variance table. Writing
# with row.names=F alone dropped them, leaving variance fractions that could
# not be traced back to a peak. Emit them as an explicit leading peak.id column.
gaba.group.sex.out <- data.frame(peak.id = rownames(gaba.group.sex), data.frame(gaba.group.sex), check.names = FALSE)
write.table(gaba.group.sex.out, "gaba.group.sex.variance.txt", quote=F, row.names=F, sep="\t")
# Annotate every GABA peak in the variance table with its closest gene.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header=F, sep="\t")
names(gene) <- c("chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info")
# Split the attribute string on the first space, then keep the text before the first ";" as gid.
gene.df <- gene %>% separate("info", c("gene", "id"), sep=" ")
gene.id <- gene.df %>% separate("id", c("gid"), sep=";")

library(tidygenomics)
# Recover chr/start/end from the ":"-separated peak IDs stored in the row names.
vp$id <- rownames(vp)
peaks <- data.frame(vp)
peaks.coord <- separate(peaks, id, c("chr", "start", "end"), sep=":")
coord.cols <- c("start", "end")
peaks.coord[coord.cols] <- lapply(peaks.coord[coord.cols], as.numeric)

# Left join: every peak is kept and gets a `distance` to its closest gene.
peaks.gene <- genome_join_closest(peaks.coord, gene.id, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
by.sex <- order(peaks.gene$SEX, decreasing=TRUE)
head(peaks.gene[by.sex,])
subset(peaks.gene, SEX > 0.40 & GROUP > 0.40)
# GROUP AGE SEX Residuals chr.x start.x end.x
# 110636 0.4265484 0.005726043 0.4130875 0.1546381 NC_000013.11 66482806 66483306
# 137303 0.4145981 0.011713842 0.4019596 0.1717284 NC_000018.10 52407619 52408119
# chr.y source annotation start.y end.y dot strand
# 110636 NC_000013.11 BestRefSeq%2CGnomon gene 66302834 67230336 . -
# 137303 NC_000018.10 BestRefSeq%2CGnomon gene 52340172 53535899 . +
# dot2 gene gid distance
# 110636 . gene_id PCDH9 0
# 137303 . gene_id DCC 0
### what happens if we run variance without female samples...
# Same GABA analysis restricted to SEX == "m"; SEX is therefore dropped from the model.
counts.mat.t <- t(counts.mat)
# Three metadata columns (1, 2 and 5) followed by one column per peak.
counts.celltype <- cbind(meta.sample[,c(1:2,5)], counts.mat.t)
counts.gaba <- subset(counts.celltype, Celltype == "GABA" & SEX == "m")
# Drop the three metadata columns and flip back to peaks x samples.
counts.gaba.mat <- t(counts.gaba[,-(1:3)])
meta.gaba <- subset(meta.sample, Celltype == "GABA" & SEX == "m")
colnames(counts.gaba.mat) <- meta.gaba$Sample.ID

form <- ~ AGE + (1|GROUP)
varPart <- fitExtractVarPartModel(counts.gaba.mat, form, meta.gaba )
vp <- sortCols( varPart )

# Recover coordinates from the peak IDs and attach the closest gene.
vp$id <- rownames(vp)
peaks <- data.frame(vp)
peaks.coord <- separate(peaks, id, c("chr", "start", "end"), sep=":")
coord.cols <- c("start", "end")
peaks.coord[coord.cols] <- lapply(peaks.coord[coord.cols], as.numeric)
peaks.gene <- genome_join_closest(peaks.coord, gene.id, by=c("chr", "start", "end"), distance_column_name="distance", mode="left")
by.group <- order(peaks.gene$GROUP, decreasing=TRUE)
head(peaks.gene[by.group,])
# Build a DiffBind-style sample sheet that also carries each sample's sex.
# The pasted listing below looks like the column names of an existing sample
# sheet, kept as a reference for the layout -- confirm.
## [1] "SampleID" "Tissue" "Factor" "Condition" "Treatment"
## [6] "Replicate" "bamReads" "ControlID" "bamControl" "Peaks"
## [11] "PeakCaller"
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
df <- read.delim("ATAC.All.Metadata.txt", header=T, sep="\t")
# Sample counts by sex, and by sex within the Control group.
nrow(subset(df, df$SEX == "m"))
# 143
nrow(subset(df, df$SEX == "f"))
# 36
nrow(subset(df, df$SEX == "m" & df$GROUP == "Control"))
# 69 <-- 69 male control, 74 male heroin
nrow(subset(df, df$SEX == "f" & df$GROUP == "Control"))
# 21 <-- 21 female control, 15 female heroin
library(dplyr)
library(tidyr)
# Split Sample on "_" into Sample.ID and SampleSeq; the full original Sample
# string is reused below as the file-name stem for peaks and BAMs.
df2 <- separate(df, Sample, c("Sample.ID", "SampleSeq"), sep="_")
df2$PeakName <- df$Sample
df2$PeakCaller <- "bed"
df2$PeaksDir <- "bwa.output/macs.output/peaks/"
df2$PeaksExt <- ".macs2.bed.gz"
df2$BamName <- df$Sample
df2$BamDir <- "bwa.output/rmdups.bam/"
df2$BamExt <- ".rmdups.bam"
# Paste directory + name + extension into the Peaks and bamReads paths.
# unite() removes its input columns, which shifts the column positions used below.
df3 <- unite(df2, Peaks, c(PeaksDir, PeakName, PeaksExt), sep="")
df4 <- unite(df3, bamReads, c(BamDir, BamName, BamExt), sep="")
# No factor or control samples; one replicate per sample.
df4$Factor <- NA
df4$Replicate <- 1
df4$Control.ID <- NA
df4$bamControl <- NA
# Tissue mirrors Celltype; Condition and Treatment both mirror GROUP.
df4$Tissue <- df4$Celltype
df4$Condition <- df4$GROUP
df4$Treatment <- df4$GROUP
df4$Sex <- df4$SEX
# Positional selection and reordering into the sample-sheet layout.
# NOTE(review): the indices depend on the number of metadata columns and on the
# exact order in which the columns above were added -- re-check if either changes.
df.sample <- df4[,c(1,21,17,22,23,18,16,19,20,14,15,24)]
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
write.csv(df.sample, file = 'diffbind.sample.sex.csv', row.names=F)
# remove outlier samples based on number of mapped reads (>40M) and percentage of reads mapped (>70%)
# Implementation: keep the rows of the sex-annotated sheet whose Sample.ID also occurs in
# diffbind.macs2.outlier.sample.csv (presumably the already-filtered sheet -- confirm),
# then write one sheet per cell type in which Tissue is overwritten with the sample's sex.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
df <- read.csv("diffbind.sample.sex.csv", header = TRUE, sep = ",")
df.base <- read.csv("diffbind.macs2.outlier.sample.csv", header = TRUE, sep = ",")
df.outlier <- df[df$Sample.ID %in% df.base$Sample.ID, ]
write.csv(df.outlier, file = "diffbind.outlier.sample.sex.csv", row.names = FALSE)

# One subset per cell type; which() drops rows with a missing Tissue, as subset() did.
df.gaba <- df.outlier[which(df.outlier$Tissue == "GABA"), ]
df.glu <- df.outlier[which(df.outlier$Tissue == "GLU"), ]
df.olig <- df.outlier[which(df.outlier$Tissue == "OLIG"), ]

# Within a single cell type, reuse the Tissue column to carry sex.
df.gaba$Tissue <- df.gaba$Sex
df.glu$Tissue <- df.glu$Sex
df.olig$Tissue <- df.olig$Sex

write.csv(df.gaba[, 1:12], file = "diffbind.outlier.sample.gaba.sex.csv", row.names = FALSE)
write.csv(df.glu[, 1:12], file = "diffbind.outlier.sample.glu.sex.csv", row.names = FALSE)
write.csv(df.olig[, 1:12], file = "diffbind.outlier.sample.olig.sex.csv", row.names = FALSE)
# salloc -A SYB105 -p gpu -N 1 -t 2:00:00
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
library(DiffBind)
library(tidyverse)
# GABA: build consensus peak sets per Tissue x Condition and write each one to a text file.
# In this sample sheet the Tissue column carries sex (m/f), so the four consensus
# sets are male/female x Control/heroin.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.outlier.sample.gaba.sex.csv")
df <- dba(sampleSheet="key.files/diffbind.outlier.sample.gaba.sex.csv")
# minOverlap = 0.66: a peak must be present in at least 66% of a group's samples.
df.counted <- dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.66)
df.counted
# ID Tissue Condition Treatment Replicate Intervals
# 51 m:Control m Control Control 1 52796
# 52 f:Control f Control Control 1 57077
# 53 f:heroin f heroin heroin 1 56596
# 54 m:heroin m heroin heroin 1 43883
# Locate each consensus peakset by its ID instead of a hard-coded position: the
# positions (51-54 above) shift whenever the number of samples in the sheet changes,
# which would silently export an individual sample's peaks instead.
peakset.ids <- as.character(dba.show(df.counted)$ID)
consensus.out <- c("m:heroin"  = "diffbind.MACS2.consensus.gaba.male.heroin.txt",
                   "m:Control" = "diffbind.MACS2.consensus.gaba.male.control.txt",
                   "f:heroin"  = "diffbind.MACS2.consensus.gaba.female.heroin.txt",
                   "f:Control" = "diffbind.MACS2.consensus.gaba.female.control.txt")
for (id in names(consensus.out)) {
  # Fail loudly if a consensus set is missing or ambiguous rather than writing the wrong peaks.
  stopifnot(sum(peakset.ids == id) == 1)
  write.table(df.counted$peaks[[match(id, peakset.ids)]], consensus.out[[id]], quote=FALSE, row.names=FALSE, sep="\t")
}
# GLU: build consensus peak sets per Tissue x Condition and write each one to a text file.
# In this sample sheet the Tissue column carries sex (m/f), so the four consensus
# sets are male/female x Control/heroin.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.outlier.sample.glu.sex.csv")
df <- dba(sampleSheet="key.files/diffbind.outlier.sample.glu.sex.csv")
# minOverlap = 0.66: a peak must be present in at least 66% of a group's samples.
df.counted <- dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.66)
df.counted
# 52 m:Control m Control Control 1 90723
# 53 f:Control f Control Control 1 78754
# 54 f:heroin f heroin heroin 1 74804
# 55 m:heroin m heroin heroin 1 77119
# Locate each consensus peakset by its ID instead of a hard-coded position: the
# positions (52-55 above) shift whenever the number of samples in the sheet changes,
# which would silently export an individual sample's peaks instead.
peakset.ids <- as.character(dba.show(df.counted)$ID)
consensus.out <- c("f:heroin"  = "diffbind.MACS2.consensus.glu.female.heroin.txt",
                   "f:Control" = "diffbind.MACS2.consensus.glu.female.control.txt",
                   "m:heroin"  = "diffbind.MACS2.consensus.glu.male.heroin.txt",
                   "m:Control" = "diffbind.MACS2.consensus.glu.male.control.txt")
for (id in names(consensus.out)) {
  # Fail loudly if a consensus set is missing or ambiguous rather than writing the wrong peaks.
  stopifnot(sum(peakset.ids == id) == 1)
  write.table(df.counted$peaks[[match(id, peakset.ids)]], consensus.out[[id]], quote=FALSE, row.names=FALSE, sep="\t")
}
# OLIG: build consensus peak sets per Tissue x Condition and write each one to a text file.
# In this sample sheet the Tissue column carries sex (m/f), so the four consensus
# sets are male/female x Control/heroin.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/")
samples <- read.csv("key.files/diffbind.outlier.sample.olig.sex.csv")
df <- dba(sampleSheet="key.files/diffbind.outlier.sample.olig.sex.csv")
# minOverlap = 0.66: a peak must be present in at least 66% of a group's samples.
df.counted <- dba.peakset(df, consensus = c(DBA_TISSUE,DBA_CONDITION), minOverlap=0.66)
df.counted
# 48 m:Control m Control Control 1 37767
# 49 f:Control f Control Control 1 42773
# 50 f:heroin f heroin heroin 1 47865
# 51 m:heroin m heroin heroin 1 48699
# Locate each consensus peakset by its ID instead of a hard-coded position: the
# positions (48-51 above) shift whenever the number of samples in the sheet changes,
# which would silently export an individual sample's peaks instead.
peakset.ids <- as.character(dba.show(df.counted)$ID)
consensus.out <- c("f:heroin"  = "diffbind.MACS2.consensus.olig.female.heroin.txt",
                   "f:Control" = "diffbind.MACS2.consensus.olig.female.control.txt",
                   "m:heroin"  = "diffbind.MACS2.consensus.olig.male.heroin.txt",
                   "m:Control" = "diffbind.MACS2.consensus.olig.male.control.txt")
for (id in names(consensus.out)) {
  # Fail loudly if a consensus set is missing or ambiguous rather than writing the wrong peaks.
  stopifnot(sum(peakset.ids == id) == 1)
  write.table(df.counted$peaks[[match(id, peakset.ids)]], consensus.out[[id]], quote=FALSE, row.names=FALSE, sep="\t")
}
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts
# make ROTS_input.csv file with consensus peaks and bam directories
# sed '1d' diffbind.MACS2.consensus.gaba.male.control.txt > diffbind.MACS2.consensus.gaba.male.control.bed
# sed '1d' diffbind.MACS2.consensus.gaba.male.heroin.txt > diffbind.MACS2.consensus.gaba.male.heroin.bed
# sed '1d' diffbind.MACS2.consensus.gaba.female.control.txt > diffbind.MACS2.consensus.gaba.female.control.bed
# sed '1d' diffbind.MACS2.consensus.gaba.female.heroin.txt > diffbind.MACS2.consensus.gaba.female.heroin.bed
# Write the ROTS input tables for GABA, one per sex. Each row is
# <consensus peak file>;<BAM path>;<group>, with group 1 = Control and 2 = heroin.
# Tissue holds sex (m/f) in this sample sheet.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
df <- read.delim("diffbind.outlier.sample.gaba.sex.csv", header=TRUE, sep=",")
# Columns are selected by name. The previous positional pick c(13,15,14) resolved to
# peaks, bam, rep only while the sample sheet had exactly 12 columns.
rots.columns <- c("peaks", "bam", "rep")

# --- male ---
gaba.control <- subset(df, df$Condition == "Control" & df$Tissue == "m")
gaba.heroin <- subset(df, df$Condition == "heroin" & df$Tissue == "m")
gaba.control$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.gaba.male.control.bed"
gaba.heroin$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.gaba.male.heroin.bed"
gaba.control$rep <- 1
gaba.heroin$rep <- 2
# bamReads is relative to the project directory; ROTS is run from elsewhere, so make it absolute.
gaba.control$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", gaba.control$bamReads)
gaba.heroin$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", gaba.heroin$bamReads)
gaba <- rbind(gaba.control[, rots.columns], gaba.heroin[, rots.columns])
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts")
write.table(gaba, "ROTS_input_gaba_male.csv", quote=FALSE, row.names=FALSE, col.names=FALSE, sep=";")

# --- female ---
gaba.control <- subset(df, df$Condition == "Control" & df$Tissue == "f")
gaba.heroin <- subset(df, df$Condition == "heroin" & df$Tissue == "f")
gaba.control$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.gaba.female.control.bed"
gaba.heroin$peaks <- "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/diffbind.MACS2.consensus.gaba.female.heroin.bed"
gaba.control$rep <- 1
gaba.heroin$rep <- 2
gaba.control$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", gaba.control$bamReads)
gaba.heroin$bam <- paste0("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/", gaba.heroin$bamReads)
gaba <- rbind(gaba.control[, rots.columns], gaba.heroin[, rots.columns])
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts")
write.table(gaba, "ROTS_input_gaba_female.csv", quote=FALSE, row.names=FALSE, col.names=FALSE, sep=";")
## run on macs2 peaks (diffbind consensus)
#### make sure to install all R packages within conda environment before submitting (ROTS, GenomicRanges, rtracklayer, Rsubread, Rsamtools, ade4, made4)
#sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/Rscriptandes.sh
#** need to run separately for each... has to have the file name ROTS_input.csv
library(ROTS)
library(GenomicRanges)
library(rtracklayer)
library(Rsubread)
library(Rsamtools)
library(ade4)
library(made4)
###################################
##
## load sources
##
###################################
source("code/differential_call.R")
source("code/readcount.R")
source("code/loaddata.R")
source("code/makepeakset.R")
source("code/Normalization.R")
source("code/filtering.R")
source("code/plots.R")
###################################
##
## Main
##
###################################
#Reading in the complete data
#Setting the filesPath and files variable for complete data
# Label of the cell type / sex being processed; it is embedded in every output file
# name below. Previously the RData files were named "olig_female" while the results
# table was named "olig_male", so a female run overwrote the male results table.
# Set this to match the ROTS_input.csv that is currently in place.
run.label <- "olig_female"
# To import narrowPeak files
extraCols_narrowPeak <- c(signalValue = "numeric", pValue = "numeric",qValue = "numeric", peak = "integer")
# To import broadPeak files
#extraCols_broadPeak <- c(signalValue = "numeric", pValue = "numeric",qValue = "numeric")
#Object_holder <- createDataObject("ROTS_input.csv",extraCols = extraCols_narrowPeak)
# The input is read as plain BED here (no narrowPeak extra columns).
Object_holder <- createDataObject("ROTS_input.csv")
#create consensus peaks
Object_holder <- makePeakSet(DB_object = Object_holder,lower = 1)
#get the consensus to gtf format
Object_holder <- formatConsensus(Object_holder)
### create readers
Object_holder <- readcount(Object_holder)
#save(Object_holder, file = "RData/object.RData")
#rowWiseVarience <- apply(Object_holder$CountsNorm,MARGIN = 1,FUN = var)
#Object_holder_filter <- filter(Object_holder,filtering = "RWV",rowWiseVarience)
normMethod <- "DESeq"
Object_holder_norm <- normalization(Object_holder,normMethod=normMethod)
#MAplot(Object_holder,normalised=TRUE,"plots/MAplot_RLE_matrix_lower1_pfalse.png")
#PCAplot(Object_holder,normalised=TRUE,"plots/pca_RLE_data_lower1_pfalse.png")
print("ROTS call")
# NOTE(review): K is computed from Object_holder (pre-normalization), not from
# Object_holder_norm, which is what is passed as the data. If Filtered_NormCounts is
# only filled in by normalization(), nrow() here is NULL and K is empty -- confirm
# which object was intended. Left unchanged so results stay reproducible.
Object_holder2 <- differentialCall(Object_holder_norm, B=100, K=floor(nrow(Object_holder$Filtered_NormCounts)/2), seed = 14,paired = FALSE, normalized = TRUE)
save(Object_holder2, file = paste0("object_lower1_pfalse_", run.label, ".RData"))
# fdr=1 keeps every peak in the output table; thresholds are applied downstream.
results <- outputGeneration(Object_holder2,fdr=1)
write.table(results, paste0("myresults_lower1_pfalse_", run.label, ".bed"), quote = FALSE, row.names = FALSE)
save(Object_holder2, file = paste0("object_lower1_pfalse2_", run.label, ".RData"))
# ROTS output for object_lower1_pfalse_gaba_female.RData:
# ROTS results:
#
# Number of resamplings: 100
#
# a1: 0
# a2: 1
# Top list size: 8100
# Reproducibility value: 0.3617
# Z-score: 1.929924
#
# 67644 rows satisfy the condition. Only ten first rows are
# displayed, see the return value for the whole output.
# Row ROTS-statistic pvalue FDR
# peak_30634 30634 -10.069757 6.726391e-06 0.0000000
# peak_47582 47582 8.974851 1.433978e-05 0.2500000
# peak_7020 7020 8.301701 2.624032e-05 0.2500000
# peak_7874 7874 -8.002233 3.348412e-05 0.2500000
# peak_43706 43706 7.617473 4.752824e-05 0.2727273
# peak_25989 25989 7.487494 5.447638e-05 0.2727273
# peak_61091 61091 7.345101 6.216368e-05 0.2727273
# peak_46675 46675 7.278253 6.682041e-05 0.2727273
# peak_2983 2983 7.277702 6.689433e-05 0.2727273
# peak_42052 42052 7.130740 7.886878e-05 0.2727273
# ROTS output for object_lower1_pfalse_gaba_male.RData:
# ROTS results:
#
# Number of resamplings: 100
#
# a1: 0
# a2: 1
# Top list size: 3400
# Reproducibility value: 0.3393118
# Z-score: 3.327007
#
# 54592 rows satisfy the condition. Only ten first rows are
# displayed, see the return value for the whole output.
# Row ROTS-statistic pvalue FDR
# peak_39422 39422 -5.675108 4.579426e-07 0
# peak_30807 30807 5.616158 5.495311e-07 0
# peak_25865 25865 -5.593543 6.411196e-07 0
# peak_14662 14662 -5.407268 1.099062e-06 0
# peak_8048 8048 5.369008 1.190651e-06 0
# peak_46410 46410 -5.244844 2.198124e-06 0
# peak_23429 23429 -5.243137 2.198124e-06 0
# peak_19381 19381 5.190722 2.656067e-06 0
# peak_23182 23182 -5.164405 2.839244e-06 0
# peak_28932 28932 5.163728 2.839244e-06 0
# ROTS output for object_lower1_pfalse_glu_male.RData:
# Number of resamplings: 100
#
# a1: 0
# a2: 1
# Top list size: 9200
# Reproducibility value: 0.4212272
# Z-score: 3.624283
#
# 93826 rows satisfy the condition. Only ten first rows are
# displayed, see the return value for the whole output.
# Row ROTS-statistic pvalue FDR
# peak_30194 30194 -7.095183 0.000000e+00 0
# peak_78776 78776 6.600575 0.000000e+00 0
# peak_29281 29281 -6.518922 5.329013e-08 0
# peak_15573 15573 -6.356710 5.329013e-08 0
# peak_93394 93394 -6.303589 1.065803e-07 0
# peak_17561 17561 6.296713 1.065803e-07 0
# peak_29374 29374 -6.273190 1.065803e-07 0
# peak_15272 15272 -6.251286 1.598704e-07 0
# peak_91812 91812 -6.191205 2.131605e-07 0
# peak_22957 22957 6.186761 2.131605e-07 0
# ROTS output for object_lower1_pfalse_glu_female.RData:
# Number of resamplings: 100
#
# a1: 0
# a2: 1
# Top list size: 9300
# Reproducibility value: 0.3238806
# Z-score: 1.900261
#
# 89991 rows satisfy the condition. Only ten first rows are
# displayed, see the return value for the whole output.
# Row ROTS-statistic pvalue FDR
# peak_8748 8748 -10.184391 1.444589e-06 0.0000000
# peak_27360 27360 8.726720 6.945139e-06 0.0000000
# peak_3384 3384 8.705373 7.000700e-06 0.0000000
# peak_85046 85046 8.366868 1.016768e-05 0.0000000
# peak_25559 25559 8.212018 1.139003e-05 0.0000000
# peak_79039 79039 7.478788 2.555811e-05 0.1111111
# peak_61090 61090 7.270769 3.172539e-05 0.1111111
# peak_45192 45192 7.187240 3.422564e-05 0.1111111
# peak_14691 14691 7.000260 4.178196e-05 0.1111111
# peak_2803 2803 6.691883 5.928371e-05 0.1666667
# ROTS output for object_lower1_pfalse_olig_male.RData:
# Number of resamplings: 100
#
# a1: 1
# a2: 0
# Top list size: 3300
# Reproducibility value: 0.8065606
# Z-score: 5.201288
#
# 49226 rows satisfy the condition. Only ten first rows are
# displayed, see the return value for the whole output.
# Row ROTS-statistic pvalue FDR
# peak_48681 48681 19845.561 8.836794e-06 0.0000000
# peak_48675 48675 13438.506 1.757201e-05 0.1794289
# peak_48944 48944 -3014.050 7.485881e-05 0.1794289
# peak_48934 48934 -2932.883 7.800756e-05 0.1794289
# peak_48678 48678 2405.350 1.010645e-04 0.1794289
# peak_48680 48680 2213.578 1.120343e-04 0.1794289
# peak_48931 48931 -1995.772 1.269654e-04 0.1794289
# peak_48676 48676 1650.872 1.607890e-04 0.1794289
# peak_48946 48946 -1598.206 1.683054e-04 0.1794289
# peak_48943 48943 -1515.556 1.785642e-04 0.1794289
# ROTS output for object_lower1_pfalse_olig_female.RData:
# Number of resamplings: 100
#
# a1: 0
# a2: 1
# Top list size: 13600
# Reproducibility value: 0.4593191
# Z-score: 1.617918
#
# 59394 rows satisfy the condition. Only ten first rows are
# displayed, see the return value for the whole output.
# Row ROTS-statistic pvalue FDR
# peak_2768 2768 13.262865 8.418359e-06 0.0000000
# peak_40021 40021 -11.940313 1.338519e-05 0.0000000
# peak_1525 1525 9.342168 5.901269e-05 0.1333333
# peak_43850 43850 8.551527 1.027882e-04 0.1333333
# peak_56226 56226 8.526970 1.055662e-04 0.1333333
# peak_56289 56289 8.401671 1.143213e-04 0.1333333
# peak_9549 9549 8.178381 1.352830e-04 0.1333333
# peak_23964 23964 8.176298 1.357039e-04 0.1333333
# peak_55216 55216 -8.094343 1.433646e-04 0.1333333
# peak_37275 37275 8.051438 1.463111e-04 0.1333333
#setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
# Male GABA ROTS results: distribution of logfc and counts of peaks past the
# |logfc| > 0.5 cutoff, with and without a p < 0.05 filter.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts")
gaba <- read.delim("myresults_lower1_pfalse_gaba_male.bed", header = TRUE, sep = " ")
## negative logfc = higher in heroin
summary(gaba$logfc)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# -1.091929 -0.153461 -0.010138 -0.005156 0.137253 1.140007
sum(gaba$logfc > 0.5, na.rm = TRUE)
# 635 gaba peaks are differential (higher in control)
sum(gaba$logfc < -0.5, na.rm = TRUE)
# 358 gaba peaks are differential (higher in heroin)
sum(gaba$logfc > 0, na.rm = TRUE)
# 26262 gaba peaks are higher in control
sum(gaba$logfc < 0, na.rm = TRUE)
# 28330 gaba peaks are higher in heroin
sum(gaba$logfc > 0.5 & gaba$pvalue < 0.05, na.rm = TRUE)
# 628
sum(gaba$logfc < -0.5 & gaba$pvalue < 0.05, na.rm = TRUE)
# 339
# > nrow(subset(gaba, gaba$logfc < -0.5 & gaba$pvalue < 0.01))
# [1] 279 (UP in heroin) <-- same trend in ChIP-seq (more down than up in male)
# > nrow(subset(gaba, gaba$logfc > 0.5 & gaba$pvalue < 0.01))
# [1] 536 (DOWN in heroin)
# Female GABA ROTS results: distribution of logfc and counts of peaks past the
# |logfc| > 0.5 cutoff, with and without a p < 0.05 filter.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts")
gaba <- read.delim("myresults_lower1_pfalse_gaba_female.bed", header = TRUE, sep = " ")
## negative logfc = higher in heroin
summary(gaba$logfc)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# -3.620983 -0.201205 0.004545 -0.005363 0.195612 3.057684
sum(gaba$logfc > 0.5, na.rm = TRUE)
# 2994 gaba peaks are differential (higher in control)
sum(gaba$logfc < -0.5, na.rm = TRUE)
# 3812 gaba peaks are differential (higher in heroin)
sum(gaba$logfc > 0, na.rm = TRUE)
# 34250 gaba peaks are higher in control
sum(gaba$logfc < 0, na.rm = TRUE)
# 33394 gaba peaks are higher in heroin
sum(gaba$logfc > 0.5 & gaba$pvalue < 0.05, na.rm = TRUE)
# 932
sum(gaba$logfc < -0.5 & gaba$pvalue < 0.05, na.rm = TRUE)
# 1479
# > nrow(subset(gaba, gaba$logfc < -0.5 & gaba$pvalue < 0.01))
# [1] 374 (UP in heroin) <-- same trend in ChIP-seq (more up than down in female)... but WAY more up in ChIP
# > nrow(subset(gaba, gaba$logfc > 0.5 & gaba$pvalue < 0.01))
# [1] 212 (DOWN in heroin)
# cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts
# awk '{if ($12 > 0.5 || $12 < -0.5) print $0}' myresults_lower1_pfalse_gaba_female.bed | sed -e 's/ /\t/g' > rots.gaba.female.differential.log2gr0.5.bed
# awk '{if ($12 > 0.5 || $12 < -0.5) print $0}' myresults_lower1_pfalse_gaba_male.bed | sed -e 's/ /\t/g' > rots.gaba.male.differential.log2gr0.5.bed
#
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/rots.gaba.female.differential.log2gr0.5.bed /Users/27n/Dropbox\ \(ORNL\)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential/.
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts/rots.gaba.male.differential.log2gr0.5.bed /Users/27n/Dropbox\ \(ORNL\)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential/.
library(tidygenomics)
library(dplyr)
library(tidyr)
# Load the gene-level GTF and the GABA RNA-seq DEG table, then reduce the GTF
# attribute column to a bare gene identifier (gid).
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Genome")
gene <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.gtf", header = FALSE, sep = "\t")
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
gaba.int <- read.delim("GABA.DEG.txt", header = TRUE, sep = "\t")
# Column 3 flags non-significant genes as "NO"; keep everything else.
gaba.deg <- gaba.int[which(gaba.int[, 3] != "NO"), ]
# 396
names(gene) <- c("Chr", "source", "annotation", "Start", "End", "dot", "strand", "dot2", "info")
# info is split at the first space into the attribute key and the remainder; the
# remainder is then cut at the first ";" so that only the identifier is left.
gene.df <- separate(data = gene, col = "info", into = c("gene", "id"), sep = " ")
gene.id <- separate(data = gene.df, col = "id", into = c("gid"), sep = ";")
# Male GABA: attach the closest gene to every differential peak (|logfc| > 0.5)
# and list the peaks whose closest gene is an RNA-seq DEG.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
gaba <- read.delim("rots.gaba.male.differential.log2gr0.5.bed", header = TRUE, sep = "\t")
# 994
gaba.gene <- genome_join_closest(
  gaba, gene.id,
  by = c("Chr", "Start", "End"),
  distance_column_name = "distance",
  mode = "left"
)
summary(gaba.gene$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0 0 0 21538 9326 1206653
hist(gaba.gene$distance)
# Peaks whose closest gene appears in the DEG list.
gaba.gene.deg <- gaba.gene[gaba.gene$gid %in% gaba.deg$Gene_Symbol, ]
nrow(gaba.gene.deg)
# 11
length(unique(gaba.gene.deg$gid))
# 10
gaba.gene.deg[, c("gid", "logfc", "distance")] #gid, log2fc, distance
# gid logfc distance
# 5 VGF 0.8047382 0 ***
# 45 TRIB1 0.5832776 0
# 49 VGF 0.5812751 0 ***
# 132 MFAP3L 0.6224854 0
# 195 LOC105376290 -0.6084616 7752
# 405 RCN1 0.5935030 0
# 500 LINC01619 0.6305883 0
# 660 PCSK1 0.5057825 0 ***
# 873 APOC1 -0.5505838 0
# 891 CDH19 0.6384764 28594
# 1018 C20orf203 -0.5337565 10210
summary(gaba.gene.deg$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0 0 0 4232 3876 28594
hist(gaba.gene.deg$distance)
# Female GABA: attach the closest gene to every differential peak (|logfc| > 0.5)
# and list the peaks whose closest gene is an RNA-seq DEG.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
gaba <- read.delim("rots.gaba.female.differential.log2gr0.5.bed", header = TRUE, sep = "\t")
# 6806
gaba.gene <- genome_join_closest(
  gaba, gene.id,
  by = c("Chr", "Start", "End"),
  distance_column_name = "distance",
  mode = "left"
)
summary(gaba.gene$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 0 0 0 14989 2314 3598727 11
hist(gaba.gene$distance)
# Peaks whose closest gene appears in the DEG list.
gaba.gene.deg <- gaba.gene[gaba.gene$gid %in% gaba.deg$Gene_Symbol, ]
nrow(gaba.gene.deg)
# 59
length(unique(gaba.gene.deg$gid))
# 48
gaba.gene.deg[, c("gid", "logfc", "distance")]
# gid logfc distance
# 59 LINC02752 -0.5596599 36522
# 238 LOC105375492 0.5208297 0
# 312 CDH19 0.8598066 28618
# 330 ADARB2 0.8469246 4663
# 398 LOC107984750 -1.1074450 0
# 411 FOLH1 0.7695809 42563
# 468 HPN-AS1 -0.6345851 0
# 473 FAT1 0.7236985 9523
# 687 OACYLP 0.6608277 0
# 730 VRK2 -0.7143743 21689
# 760 EPHA2 -0.5679029 0
# 782 LOC105375534 -0.6787270 17650
# 909 LOC107984525 -0.5182956 1653
# 947 RFTN2 -1.2019779 0
# 952 CTNNA3 0.6083859 0
# 1298 VPS33B-DT -0.6436861 18552
# 1317 SNHG8 0.5219604 0
# 1451 TPI1P2 0.6063698 9228
# 1476 LOC107984516 -0.5819488 116411
# 1649 PRR18 0.6405344 8339
# 1754 LOC102724968 0.5423262 0
# 1960 S100B -1.1802705 0
# 1966 LINC01619 0.7933584 0
# 1985 HAVCR1 0.5996236 7345
# 1990 LOC107984516 0.7223717 68081
# 2010 RCN1 0.5442883 0
# 2022 GAS1 0.5268938 34637
# 2220 TAC1 -0.5950813 32929
# 2671 HAVCR2 -0.5505262 5147
# 2783 CTNNA3 -0.5777344 0
# 2821 PCSK1 0.5285237 0 ***
# 2883 LOC105374051 -0.5287570 30003
# 2967 LINC01619 -0.5373097 0
# 3171 CTNNA3 0.5626638 0
# 3196 RNU6-2 0.5832007 0
# 3605 DUSP5 0.7745962 1187 ***
# 3820 LOC105379412 -0.6582849 0
# 4048 RNU6-8 0.6641185 0
# 4120 BCAS1 -0.5275112 26153
# 4186 LOC107984390 -0.5413628 32808
# 4505 NPAS4 0.5208521 0 ***
# 4841 LINC00639 0.8148753 31928
# 4895 HAVCR1 0.6408532 6205
# 5056 TMEM144 0.5033154 0
# 5244 TBXAS1 0.5189381 0
# 5344 SHC4 -0.5091283 0 ***
# 5463 FOS 0.5065944 40825 ***
# 5537 PLP1 -0.5387689 0
# 5667 LINC00639 -0.5539002 10865
# 5683 HPN-AS1 -0.6119247 0
# 5715 ARHGAP28 0.5329917 0
# 5761 CTNNA3 0.8071776 0
# 6208 LINC02594 -0.5215768 0
# 6405 LOC107985994 -0.5788280 0
# 6656 ADARB2 0.5077638 0
# 6809 MAL -0.5976155 30499
# 6992 LOC102724968 -0.6853386 0
# 7033 LOC107984696 0.5006569 0
# 7312 CTNNA3 -0.5466261 0
# Distribution of peak-to-gene distances for the female peaks whose closest gene is a DEG.
summary(gaba.gene.deg$distance)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0 0 0 11424 18101 116411
hist(gaba.gene.deg$distance)
#### Looking in the wrong directions?
# Work from the project root. The path contains spaces and parentheses, so it must be
# quoted; unquoted, the shell rejects the "(" as a syntax error.
cd "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC"
# Drop the header line, keep columns 2-4 and 12 (read back later as peak chr, start,
# end and log2fc), and sort by chromosome then start as bedtools expects.
sed '1d' Consensus.Peaks/ROTS.differential/rots.gaba.male.differential.log2gr0.5.bed | cut -f 2-4,12 | sort -k 1,1 -k 2,2n > Consensus.Peaks/ROTS.differential/rots.gaba.male.differential.log2gr0.5.sorted.bed
sed '1d' Consensus.Peaks/ROTS.differential/rots.gaba.female.differential.log2gr0.5.bed | cut -f 2-4,12 | sort -k 1,1 -k 2,2n > Consensus.Peaks/ROTS.differential/rots.gaba.female.differential.log2gr0.5.sorted.bed
# Closest gene for each peak. -D b reports a signed distance relative to the gene's
# strand and -id ignores genes downstream of the peak (see bedtools closest docs).
bedtools2/bin/closestBed -D b -id -b Genome/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.gtf -a Consensus.Peaks/ROTS.differential/rots.gaba.male.differential.log2gr0.5.sorted.bed > gaba.male.genes.rots.bedtools.bed
bedtools2/bin/closestBed -D b -id -b Genome/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.gtf -a Consensus.Peaks/ROTS.differential/rots.gaba.female.differential.log2gr0.5.sorted.bed > gaba.female.genes.rots.bedtools.bed
library(dplyr)
library(tidyr)
# Male GABA: read the bedtools closest-gene table and keep the peaks whose closest
# gene is an RNA-seq DEG, showing ATAC log2fc next to RNA log2FC.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
gaba.int <- read.delim("GABA.DEG.txt", header = TRUE, sep = "\t")
# Column 3 flags non-significant genes as "NO"; keep everything else.
gaba.deg <- gaba.int[which(gaba.int[, 3] != "NO"), ]
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.genes <- read.delim("gaba.male.genes.rots.bedtools.bed", header = FALSE, sep = "\t")
# 4 peak columns, 9 GTF columns, then the bedtools distance.
names(gaba.genes) <- c("chr.peak", "start.peak", "end.peak", "log2fc",
                       "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info",
                       "distance")
# Reduce the GTF attribute column to the bare gene symbol.
gaba.genes.df <- separate(data = gaba.genes, col = "info", into = c("gene", "id"), sep = " ")
gaba.genes.id <- separate(data = gaba.genes.df, col = "id", into = c("Gene_Symbol"), sep = ";")
gaba.genes.deg <- inner_join(x = gaba.genes.id, y = gaba.deg, by = "Gene_Symbol")
nrow(gaba.genes.deg)
# 12
gaba.genes.deg[, c("Gene_Symbol", "distance", "log2FC", "log2fc")]
# Gene_Symbol distance log2FC log2fc
# 1 CD84 -27582 -0.9649 -0.5191476
# 2 LOC101928278 -44455 0.8639 -0.5150767
# 3 MFAP3L 0 -0.6426 0.6224854
# 4 PCSK1 0 -0.6291 0.5057825 ***
# 5 DSP -72489 -0.9849 0.6042699
# 6 VGF 0 -0.8737 0.5812751 ***
# 7 VGF 0 -0.8737 0.8047382 ***
# 8 TRIB1 0 -0.8611 0.5832776
# 9 RCN1 0 -0.5863 0.5935030
# 10 LINC01619 0 0.8301 0.6305883
# 11 CDH19 -28596 -1.3992 0.6384764
# 12 APOC1 0 -0.9114 -0.5505838
# Female GABA: read the bedtools closest-gene table and keep the peaks whose closest
# gene is an RNA-seq DEG (gaba.deg), showing ATAC log2fc next to RNA log2FC.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.genes <- read.delim("gaba.female.genes.rots.bedtools.bed", header = FALSE, sep = "\t")
# 4 peak columns, 9 GTF columns, then the bedtools distance.
names(gaba.genes) <- c("chr.peak", "start.peak", "end.peak", "log2fc",
                       "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info",
                       "distance")
# Reduce the GTF attribute column to the bare gene symbol.
gaba.genes.df <- separate(data = gaba.genes, col = "info", into = c("gene", "id"), sep = " ")
gaba.genes.id <- separate(data = gaba.genes.df, col = "id", into = c("Gene_Symbol"), sep = ";")
gaba.genes.deg <- inner_join(x = gaba.genes.id, y = gaba.deg, by = "Gene_Symbol")
nrow(gaba.genes.deg)
# 55
gaba.genes.deg[, c("Gene_Symbol", "distance", "log2FC", "log2fc")]
# Gene_Symbol distance log2FC log2fc
# 1 EPHA2 0 -1.1072 -0.5679029
# 2 CD84 -27574 -0.9649 -0.7511675
# 3 VRK2 -21690 -0.9269 -0.7143743
# 4 MAL -30500 -1.4698 -0.5976155
# 5 RFTN2 0 -1.3806 -1.2019779
# 6 LOC107985994 0 0.5453 -0.5788280
# 7 LOC105374051 -30004 0.5158 -0.5287570
# 8 SNHG8 0 -0.5218 0.5219604
# 9 LOC105379412 0 0.7495 -0.6582849
# 10 LOC107986196 -18099 0.6845 -0.8059755
# 11 TMEM144 0 -0.9009 0.5033154
# 12 PCSK1 0 -0.6291 0.5285237 ***
# 13 SPRY4 -205734 -0.6521 0.5398216
# 14 HAVCR1 -6207 0.8808 0.6408532
# 15 HAVCR1 -10711 0.8808 -0.5505262
# 16 LOC105375492 0 0.6508 0.5208297
# 17 TBXAS1 0 -0.8781 0.5189381
# 18 LOC105375534 -17652 0.6644 -0.6787270
# 19 TRIB1 -51017 -0.8611 -0.5373952
# 20 NXNL2 -30381 0.5365 -0.5638427
# 21 ADARB2 0 0.9507 0.5077638
# 22 CTNNA3 0 -0.9392 0.6083859
# 23 CTNNA3 0 -0.9392 -0.5466261
# 24 CTNNA3 0 -0.9392 0.5626638
# 25 CTNNA3 0 -0.9392 0.8071776
# 26 CTNNA3 0 -0.9392 -0.5777344
# 27 LINC02752 -36523 1.0827 -0.5596599
# 28 RCN1 0 -0.5863 0.5442883
# 29 FOLH1 -42565 -1.0075 0.7695809
# 30 NPAS4 0 -1.3178 0.5208521 ***
# 31 LOC105369691 -187562 -0.7737 0.7223717
# 32 LOC105369691 -235892 -0.7737 -0.5819488
# 33 LOC107984525 -1654 0.5764 -0.5182956
# 34 LINC01619 0 0.8301 0.7933584
# 35 LINC01619 0 0.8301 -0.5373097
# 36 SPRY2 -128663 -0.6808 0.5452184
# 37 RNU6-8 0 -1.1267 0.6641185
# 38 FOS -40826 -1.8009 0.5065944 ***
# 39 LOC107984696 0 0.5234 0.5006569
# 40 SHC4 0 -1.0306 -0.5091283 ***
# 41 LOC107984750 0 0.8249 -1.1074450
# 42 LINC02594 0 0.9912 -0.5215768
# 43 ARHGAP28 0 0.6531 0.5329917
# 44 OACYLP 0 0.8421 0.6608277
# 45 CDH19 -28620 -1.3992 0.8598066
# 46 RNU6-2 0 -1.5723 0.5832007
# 47 HPN-AS1 0 -2.4048 -0.6345851
# 48 HPN-AS1 0 -2.4048 -0.6119247
# 49 ZIM2 -53871 0.8765 -0.5957950
# 50 C20orf203 -14254 0.6314 -0.5809369
# 51 LOC102724968 0 0.7663 0.5423262
# 52 LOC102724968 0 0.7663 -0.6853386
# 53 BCAS1 -26155 -1.7181 -0.5275112
# 54 S100B 0 -1.1083 -1.1802705
# 55 PLP1 0 -1.9576 -0.5387689 ***
####### finding SHC4 and FOS in female but not male... look at non differential peaks in male
# Work from the project root. The path contains spaces and parentheses, so it must be
# quoted; unquoted, the shell rejects the "(" as a syntax error.
cd "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC"
# The full ROTS table is space-delimited; convert it to tabs so cut can select columns.
tr ' ' \\t < Consensus.Peaks/ROTS.differential/myresults_lower1_pfalse_gaba_male.bed > Consensus.Peaks/ROTS.differential/myresults_lower1_pfalse_gaba_male.tab.bed
# Drop the header, keep columns 2-4 and 12, and sort by chromosome then start. This is
# every male peak, not only those past the |logfc| > 0.5 cutoff.
sed '1d' Consensus.Peaks/ROTS.differential/myresults_lower1_pfalse_gaba_male.tab.bed | cut -f 2-4,12 | sort -k 1,1 -k 2,2n > Consensus.Peaks/ROTS.differential/rots.gaba.male.differential.all.sorted.bed
bedtools2/bin/closestBed -D b -id -b Genome/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.gtf -a Consensus.Peaks/ROTS.differential/rots.gaba.male.differential.all.sorted.bed > gaba.male.genes.rotsALL.bedtools.bed
library(dplyr)
library(tidyr)
# Male GABA, all peaks (no logfc cutoff): join the closest-gene table to the DEG
# list and inspect every peak assigned to SHC4 and to FOS.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq")
gaba.int <- read.delim("GABA.DEG.txt", header = TRUE, sep = "\t")
# Column 3 flags non-significant genes as "NO"; keep everything else.
gaba.deg <- gaba.int[which(gaba.int[, 3] != "NO"), ]
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.genes <- read.delim("gaba.male.genes.rotsALL.bedtools.bed", header = FALSE, sep = "\t")
# 4 peak columns, 9 GTF columns, then the bedtools distance.
names(gaba.genes) <- c("chr.peak", "start.peak", "end.peak", "log2fc",
                       "chr", "source", "annotation", "start", "end", "dot", "strand", "dot2", "info",
                       "distance")
# Reduce the GTF attribute column to the bare gene symbol.
gaba.genes.df <- separate(data = gaba.genes, col = "info", into = c("gene", "id"), sep = " ")
gaba.genes.id <- separate(data = gaba.genes.df, col = "id", into = c("Gene_Symbol"), sep = ";")
gaba.genes.deg <- inner_join(x = gaba.genes.id, y = gaba.deg, by = "Gene_Symbol")
gaba.genes.deg.df <- gaba.genes.deg[, c("Gene_Symbol", "distance", "log2FC", "log2fc")]
gaba.genes.deg.df[which(gaba.genes.deg.df$Gene_Symbol == "SHC4"), ]
# Gene_Symbol distance log2FC log2fc
# 470 SHC4 0 -1.0306 -0.1787468
# 471 SHC4 0 -1.0306 0.2932485
# 472 SHC4 0 -1.0306 0.3926758
# 473 SHC4 0 -1.0306 0.4708089 *** opposite direction of what was observed in female (just below threshold)
# 474 SHC4 0 -1.0306 0.4003152
# 475 SHC4 0 -1.0306 0.2399484
# 476 SHC4 -11563 -1.0306 0.1408785
gaba.genes.deg.df[which(gaba.genes.deg.df$Gene_Symbol == "FOS"), ]
# Gene_Symbol distance log2FC log2fc
# 449 FOS -40779 -1.8009 0.27554181 <-- Too far below threshold, in the same direction as what was seen in female
# 450 FOS -35713 -1.8009 -0.09131088
# 451 FOS -18995 -1.8009 0.16803847
# 452 FOS -4892 -1.8009 0.25279572
# 453 FOS -1346 -1.8009 0.34510541
# 454 FOS 0 -1.8009 0.24225795
# 455 FOS 0 -1.8009 0.36956356
# NPAS4, SHC4, FOS, PLP1 <-- all found for differential ATAC in female but NOT in male. VGF found in male NOT female. PCSK1 found in both
# ** SHC4 in male is just below the threshold i used for "differential atac" and the peak is in the opposite direction from that in female **
#### look at the closest DEG to each differential peak (regardless of other genes in between)
# Paths are relative, so these must be run from the Sprint.Opioid.ATAC project root.
# The -b file is the DEG-only GTF, so each peak is matched to its nearest DEG rather
# than its nearest gene.
# NOTE(review): these calls use -D a, while the gene-level closestBed calls in this file
# use -D b; the -a BED files carry no strand column -- confirm the sign convention is
# the one intended.
bedtools2/bin/closestBed -D a -id -b RNAseq/gaba.deg.sorted.gtf -a Consensus.Peaks/ROTS.differential/rots.gaba.male.differential.log2gr0.5.sorted.bed > gaba.male.deg.rots.bedtools.bed
bedtools2/bin/closestBed -D a -id -b RNAseq/gaba.deg.sorted.gtf -a Consensus.Peaks/ROTS.differential/rots.gaba.female.differential.log2gr0.5.sorted.bed > gaba.female.deg.rots.bedtools.bed
library(dplyr)
library(tidyr)
# Male GABA: differential peaks that have a DEG within 100 kb.
# Note that gaba.deg is reassigned here to the bedtools table (no longer the DEG list).
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.deg <- read.delim("gaba.male.deg.rots.bedtools.bed", header = FALSE, sep = "\t")
# V14 is the bedtools distance column.
gaba.deg.hun <- gaba.deg[which(abs(gaba.deg$V14) < 100000), ]
# Drop rows where the attribute column is the "." placeholder.
gaba.deg.hun.df <- gaba.deg.hun[which(gaba.deg.hun$V13 != "."), ]
gaba.deg.hun.df[, c("V13", "V14", "V11", "V4")]
# V13 V14 V11 V4
# 72 gene_idCD84 -27582 - -0.5191476
# 90 gene_idIRF6 -79998 - -0.5646152
# 180 gene_idLOC105377082 -26450 + -0.5125658
# 183 gene_idLOC101929054 -74670 + -0.5738423
# 184 gene_idLOC101929054 -76529 + -0.5159147
# 276 gene_idMFAP3L 0 - 0.6224854
# 322 gene_idMIR583HG -41244 + 0.5041195
# 323 gene_idPCSK1 0 - 0.5057825 ***
# 378 gene_idTENT5A -81342 - 0.6036211
# 442 gene_idVGF 0 - 0.5812751 ***
# 443 gene_idVGF 0 - 0.8047382 ***
# 444 gene_idPLOD3 -55601 - 0.5016634
# 445 gene_idPLOD3 -60807 - 0.7727624
# 492 gene_idTRIB1 0 + 0.5832776
# 497 gene_idLOC105375767 -82070 + 0.6006698
# 526 gene_idLOC105376290 -7754 + -0.6084616
# 555 gene_idDDIT4 -72305 + -0.5117865
# 565 gene_idLOC105378572 -39599 + 0.5493030
# 568 gene_idCD81-AS1 -12568 - -0.5227208
# 574 gene_idRCN1 0 + 0.5935030
# 580 gene_idLOC107984338 -31384 - -0.6138288
# 624 gene_idLOC105369843 -73556 - 0.5817588
# 636 gene_idLINC01619 0 - 0.6305883
# 637 gene_idLINC01619 -86333 - 0.5047600
# 747 gene_idLOC105371399 -16043 + -0.5073893
# 782 gene_idTMC6 -35904 - -0.7577361
# 783 gene_idTMC6 -36821 - -0.5561223
# 814 gene_idCDH19 -28596 - 0.6384764
# 821 gene_idMIR7-3HG -58682 + -0.5435679
# 851 gene_idHPN-AS1 -28799 - -0.5049006
# 853 gene_idTMEM147 -96766 + -0.5007796
# 857 gene_idGGN -44607 - -0.5856537
# 869 gene_idAPOC1 0 + -0.5505838
# 870 gene_idFOSB -83200 + -0.5510145 ***
# 952 gene_idCDC42EP1 -53848 + -0.5138011
# 986 gene_idPNMA6F -59918 - -0.5291648
# Female GABA: differential peaks that have a DEG within 100 kb.
# Note that gaba.deg is reassigned here to the bedtools table (no longer the DEG list).
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.deg <- read.delim("gaba.female.deg.rots.bedtools.bed", header = FALSE, sep = "\t")
# V14 is the bedtools distance column.
gaba.deg.hun <- gaba.deg[which(abs(gaba.deg$V14) < 100000), ]
# Drop rows where the attribute column is the "." placeholder.
gaba.deg.hun.df <- gaba.deg.hun[which(gaba.deg.hun$V13 != "."), ]
gaba.deg.hun.df[, c("V13", "V14", "V11", "V4")]
# V13 V14 V11 V4
# 5 gene_idLOC112268219 -34954 + 0.6905316
# 54 gene_idEPHA2 0 - -0.5679029
# 137 gene_idGPR199P -52810 - 0.5366557
# 204 gene_idSSBP3-AS1 -38695 + -0.6406612
# 243 gene_idLRRC53 -96957 - 0.5310484
# 310 gene_idLOC107985174 -22194 + 0.7628941
# 311 gene_idLOC107985174 -39678 + -0.6789142
# 340 gene_idLOC105369199 -46806 + 0.8299937
# 394 gene_idKCNJ10 -21376 - -0.6149127
# 395 gene_idKCNJ10 -79688 - 0.5773312
# 397 gene_idCD84 -27574 - -0.7511675
# 423 gene_idMETTL18 -14535 - -0.6883267
# 510 gene_idIRF6 -78572 - 0.9792439
# 511 gene_idIRF6 -90649 - 0.5796068
# 810 gene_idMAL -37133 + -1.1001047
# 1033 gene_idOSGEPL1-AS1 -33990 + 0.8046016
# 1042 gene_idRFTN2 0 - -1.2019779
# 1092 gene_idLOC101928156 -39118 + -0.8164184
# 1115 gene_idLOC107985994 0 - -0.5788280
# 1172 gene_idLOC107986008 -10794 - -0.6233095
# 1173 gene_idLOC107986008 -38730 - 0.5310272
# 1204 gene_idHDAC11-AS1 -76047 - -0.5199343
# 1306 gene_idLOC105377082 -26515 + -0.7178254
# 1314 gene_idLOC101929054 -74657 + -0.7298243
# 1431 gene_idLOC105374051 -93913 + 0.8303351
# 1461 gene_idLINC02614 -71991 - 0.8604230
# 1548 gene_idCLDN11 -49352 + -0.5855879
# 1608 gene_idAPOD -44292 - -0.6301554
# 1616 gene_idLOC105374340 -99834 - 0.5758361
# 1859 gene_idSNHG8 0 + 0.5219604
# 1895 gene_idLOC105379412 0 + -0.6582849
# 1939 gene_idTMEM144 0 + 0.5033154
# 2216 gene_idMIR583HG -11178 + 0.6073391
# 2217 gene_idMIR583HG -43455 + 0.7272555
# 2219 gene_idPCSK1 0 - 0.5285237 ***
# 2220 gene_idLOC105379094 -97641 - -0.9140922
# 2311 gene_idLOC107986454 -48851 - -0.5163421
# 2357 gene_idHAVCR1 -6207 - 0.6408532
# 2358 gene_idHAVCR1 -10711 - -0.5505262
# 2367 gene_idATP10B -98419 - -0.5746547
# 2432 gene_idRPP40 -10652 - 0.5180057
# 2433 gene_idRPP40 -92988 - -0.8351080
# 2502 gene_idH1-3 -77542 - 0.6368618
# 2524 gene_idFANCE -7103 + -0.6920337
# 2610 gene_idTENT5A -10433 - -0.9465523
# 2654 gene_idLOC105377937 -87883 + -0.7096149
# 2746 gene_idLOC105378052 -26443 + 0.7920928
# 3028 gene_idTAC1 -32931 + -0.5950813
# 3039 gene_idPLOD3 -55632 - 0.7886584
# 3040 gene_idPLOD3 -60873 - 1.1143256
# 3112 gene_idLOC105375492 0 - 0.5208297
# 3118 gene_idTPI1P2 -9230 + 0.6063698
# 3155 gene_idTBXAS1 0 + 0.5189381
# 3156 gene_idLOC105375534 -17652 - -0.6787270
# 3254 gene_idLOC105379321 -6930 + 0.5909078
# 3398 gene_idGEM -65106 - 0.8223359
# 3419 gene_idMIR3151 -87808 + -0.6865572
# 3444 gene_idENPP2 -65680 - -0.6225079
# 3475 gene_idLOC105375767 -82204 + 0.5283525
# 3476 gene_idLOC105375767 -96118 + 0.5534318
# 3520 gene_idARC -47591 - -0.5236479 ***
# 3521 gene_idARC -54560 - -0.5783056 ***
# 3523 gene_idHGH1 -43867 + -0.5783017
# 3687 gene_idMIRLET7F1 -5244 + -0.6640191
# 3778 gene_idLOC107987038 -99932 + -0.5542037
# 3786 gene_idAIF1L -91087 + -0.5129467
# 3821 gene_idADARB2 0 - 0.5077638
# 3866 gene_idCOMMD3 -952 + 0.7510124
# 3868 gene_idLOC107984215 -26010 - -0.5854215
# 3971 gene_idCTNNA3 0 - 0.6083859
# 3972 gene_idCTNNA3 0 - -0.5466261
# 3973 gene_idCTNNA3 0 - 0.5626638
# 3974 gene_idCTNNA3 0 - 0.8071776
# 3975 gene_idCTNNA3 0 - -0.5777344
# 3976 gene_idCTNNA3 -67324 - 0.5282024
# 4043 gene_idFFAR4 -9736 + -0.7243349
# 4084 gene_idDUSP5 -1189 + 0.7745962 ***
# 4134 gene_idNKX6-2 -51478 - -0.7215565
# 4135 gene_idLOC105378572 -39857 + 0.5064961
# 4182 gene_idLOC107984317 -73040 + 0.5157093
# 4199 gene_idRCN1 0 + 0.5442883
# 4236 gene_idFOLH1 -42565 - 0.7695809
# 4269 gene_idNPAS4 0 + 0.5208521 ***
# 4330 gene_idLOC105369421 -58585 - -0.5347686
# 4388 gene_idLOC100132686 -82458 + 0.6950240
# 4525 gene_idLOC107984516 -68083 + 0.7223717
# 4589 gene_idLOC105369774 -36336 + -0.5182956
# 4628 gene_idLOC105369843 -73560 - 0.7212664
# 4667 gene_idLINC01619 0 - 0.7933584
# 4668 gene_idLINC01619 0 - -0.5373097
# 5009 gene_idCMTM5 -88966 + 0.5090505
# 5025 gene_idRNU6-8 0 - 0.6641185
# 5052 gene_idLOC105370491 -70959 - -0.6726513
# 5174 gene_idLOC107984696 0 - 0.5006569
# 5249 gene_idSHC4 0 - -0.5091283 ***
# 5317 gene_idADPGK-AS1 -28858 + 0.5517442
# 5339 gene_idLOC107984750 0 - -1.1074450
# 5340 gene_idLOC107984750 -64330 - -0.6235255
# 5377 gene_idMIR5094 -47985 - -0.7409865
# 5386 gene_idVPS33B-DT -18554 + -0.6436861
# 5521 gene_idMMP2 -36817 + -0.9845256
# 5589 gene_idFLJ30679 -10124 + -0.9760091
# 5606 gene_idMIR132 -1057 - 0.6272284
# 5607 gene_idMIR132 -57479 - -0.6416787
# 5612 gene_idASPA -472 + -0.7965631
# 5689 gene_idEVI2A -45609 - -0.5202837
# 5690 gene_idEVI2A -82395 - -0.6109440
# 5719 gene_idLINC02594 0 + -0.5215768
# 5720 gene_idLINC02594 -51970 + -0.6464252
# 5783 gene_idTTYH2 -89586 + -0.5190569
# 5846 gene_idLOC105371970 -50103 - -0.5265102
# 5851 gene_idARHGAP28 0 + 0.5329917
# 5966 gene_idOACYLP 0 + 0.6608277
# 5967 gene_idOACYLP -53927 + 0.6460480
# 5980 gene_idCDH19 -28620 - 0.8598066
# 6001 gene_idLOC105372210 -53035 - -0.9246692
# 6011 gene_idRNU6-2 0 + 0.5832007
# 6024 gene_idZNF556 -42960 + -0.5777035
# 6041 gene_idTEX45 -88212 + -0.6630377
# 6107 gene_idHPN-AS1 0 - -0.6345851
# 6108 gene_idHPN-AS1 0 - -0.6119247
# 6116 gene_idPPP1R14A -25371 - -0.7335585
# 6117 gene_idPPP1R14A -44274 - -0.6519936
# 6137 gene_idAPOC1 -7624 + -0.5032215
# 6138 gene_idAPOC1 -32041 + -0.6201049
# 6185 gene_idZIM2 -53871 - -0.5957950
# 6264 gene_idC20orf203 -14254 - -0.5809369
# 6290 gene_idLOC102724968 0 - 0.5423262
# 6291 gene_idLOC102724968 0 - -0.6853386
# 6292 gene_idLOC102724968 -64627 - -0.5779236
# 6293 gene_idLOC102724968 -99294 - -0.5812571
# 6339 gene_idBCAS1 -26155 - -0.5275112
# 6340 gene_idCBLN4 -86171 - 0.5556032
# 6450 gene_idLOC107987299 -92308 - -0.6512881
# 6459 gene_idS100B 0 - -1.1802705
# 6515 gene_idCDC42EP1 -23108 + 0.5868131
# 6518 gene_idSOX10 -89705 - -0.5069107
# 6524 gene_idTNRC6B-DT -13867 - -0.5264739
# 6609 gene_idLOC105373181 -93866 + 0.5863989
# 6667 gene_idPLP1 0 + -0.5387689
# 6668 gene_idPLP1 -920 + -0.6682833
## what about the closest differential peak to each deg
# For each sex, report the closest differential ATAC peak (-b) to every DEG (-a).
# -D a : report signed distance relative to the strand of the A feature
# -id  : ignore B features downstream of A (so only upstream/overlapping peaks)
for sex in male female; do
  bedtools2/bin/closestBed -D a -id \
    -a "RNAseq/gaba.${sex}.deg.sorted.gtf" \
    -b "Consensus.Peaks/ROTS.differential/rots.gaba.${sex}.differential.igv.log2gr0.5.sorted.bed" \
    > "gaba.${sex}.closestpeaktodeg.bedtools.bed"
done
library(dplyr)
library(tidyr)
# Closest differential peak per DEG: keep pairs within 100 kb (V14 = distance)
# and print the GTF attribute column (V9) next to the distance.
# --- male ---
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.deg <- read.delim("gaba.male.closestpeaktodeg.bedtools.bed", header = FALSE, sep = "\t")
gaba.deg.hun <- subset(gaba.deg, abs(V14) < 100000)
gaba.deg.hun[, c("V9", "V14")]
# --- female (overwrites the male objects above) ---
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC")
gaba.deg <- read.delim("gaba.female.closestpeaktodeg.bedtools.bed", header = FALSE, sep = "\t")
gaba.deg.hun <- subset(gaba.deg, abs(V14) < 100000)
gaba.deg.hun[, c("V9", "V14")]
# VGF violin plot of CPM values (all subjects + male/female subjects)
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins
# Keep only bins on NC_000007.14 that lie fully inside 101162509-101169956 (VGF locus);
# a bare awk pattern prints the matching line.
awk '$1 == "NC_000007.14" && $2 >= 101162509 && $3 <= 101169956' bin.matrix.txt > bin.matrix.VGF.txt
# Re-attach the column header so R can read sample names.
cat bin.matrix.header.txt bin.matrix.VGF.txt > bin.matrix.VGF.header.txt
# R
library(tidyr)
library(stringr)
library(reshape2)
library(ggplot2)

# VGF accessibility plots: for every GABA sample, average the bin values across
# the VGF-locus bins, then compare control vs heroin (all subjects, split by
# sex, and males only). Output: four PDFs in the bins directory.

# sample / condition file
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/key.files")
sample <- read.delim("atac.csaw.key1.rmdups.txt", header=TRUE, sep="\t", stringsAsFactors = FALSE)
sample.id <- separate(sample, sample.name, c("sample", "seq"), sep="_")

# Row indices into the transposed bin matrix (one row per matrix column 4:182).
# gaba control = 1:15,46,49,52,55,58,61,64,67,70,73,76,79,82,85,88
# female = 73, 76, 4, 9, 10
# gaba heroin = 91:100,136,139,142,145,148,151,154,157,160,163,166,169,171,174,177
# female = 136, 151, 163
# NOTE(review): the heroin series steps by 3 up to 169 and then continues 171,174,177
# (not 172,175,178) -- confirm against the sample key.
gaba.control.idx <- c(1:15,46,49,52,55,58,61,64,67,70,73,76,79,82,85,88)
gaba.control.female.idx <- c(4,9,10,73,76)
gaba.heroin.idx <- c(91:100,136,139,142,145,148,151,154,157,160,163,166,169,171,174,177)
gaba.heroin.female.idx <- c(136,151,163)
# Males are derived as "group minus females" so the sets cannot drift apart.
gaba.control.male.idx <- setdiff(gaba.control.idx, gaba.control.female.idx)
gaba.heroin.male.idx <- setdiff(gaba.heroin.idx, gaba.heroin.female.idx)

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/bins")
vgf <- read.delim("bin.matrix.VGF.header.txt", header=TRUE, sep="\t", stringsAsFactors = FALSE)
vgf.id <- vgf[,4:182]
# Transpose so that each row is one column of the bin matrix and each column one bin.
vgf.id.t <- data.frame(t(vgf.id))
vgf.id.num <- data.matrix(vgf.id.t)

# --- control vs heroin, all subjects ---
# drop = FALSE keeps a matrix even if the region contains a single bin.
vgf.gaba.control <- vgf.id.num[gaba.control.idx, , drop = FALSE]
vgf.gaba.heroin <- vgf.id.num[gaba.heroin.idx, , drop = FALSE]
vgf.gaba.control.mean <- data.frame(sample.mean = rowMeans(vgf.gaba.control, na.rm = FALSE, dims = 1))
vgf.gaba.heroin.mean <- data.frame(sample.mean = rowMeans(vgf.gaba.heroin, na.rm = FALSE, dims = 1))
vgf.gaba.control.mean$condition <- "Control"
vgf.gaba.heroin.mean$condition <- "Heroin"
vgf.gaba.mean <- rbind(vgf.gaba.control.mean, vgf.gaba.heroin.mean)
pdf("VGF.atac.cpm.violin.pdf")
ggplot(vgf.gaba.mean, aes(x=condition, y=sample.mean, fill=condition)) + geom_violin() + theme_classic() + scale_fill_manual(values=c("#999999", "#E69F00"))
dev.off()

# --- control vs heroin, split by sex ---
vgf.gaba.control.female <- vgf.id.num[gaba.control.female.idx, , drop = FALSE]
vgf.gaba.control.male <- vgf.id.num[gaba.control.male.idx, , drop = FALSE]
vgf.gaba.heroin.female <- vgf.id.num[gaba.heroin.female.idx, , drop = FALSE]
vgf.gaba.heroin.male <- vgf.id.num[gaba.heroin.male.idx, , drop = FALSE]
vgf.gaba.control.female.mean <- data.frame(sample.mean = rowMeans(vgf.gaba.control.female, na.rm = FALSE, dims = 1))
vgf.gaba.control.male.mean <- data.frame(sample.mean = rowMeans(vgf.gaba.control.male, na.rm = FALSE, dims = 1))
vgf.gaba.heroin.female.mean <- data.frame(sample.mean = rowMeans(vgf.gaba.heroin.female, na.rm = FALSE, dims = 1))
vgf.gaba.heroin.male.mean <- data.frame(sample.mean = rowMeans(vgf.gaba.heroin.male, na.rm = FALSE, dims = 1))
vgf.gaba.control.female.mean$condition <- "Control"
vgf.gaba.control.male.mean$condition <- "Control"
vgf.gaba.heroin.female.mean$condition <- "Heroin"
vgf.gaba.heroin.male.mean$condition <- "Heroin"
vgf.gaba.control.female.mean$sex <- "Female"
vgf.gaba.control.male.mean$sex <- "Male"
vgf.gaba.heroin.female.mean$sex <- "Female"
vgf.gaba.heroin.male.mean$sex <- "Male"
vgf.gaba.sex.mean <- rbind(vgf.gaba.control.female.mean, vgf.gaba.control.male.mean, vgf.gaba.heroin.female.mean, vgf.gaba.heroin.male.mean)
pdf("VGF.atac.cpm.sex.violin.pdf")
ggplot(vgf.gaba.sex.mean, aes(x=condition, y=sample.mean, fill=condition)) + geom_violin() + theme_classic() + scale_fill_manual(values=c("#999999", "#E69F00")) + facet_grid(. ~ sex)
dev.off()
pdf("VGF.atac.cpm.sex.boxplot.pdf")
ggplot(vgf.gaba.sex.mean, aes(x=condition, y=sample.mean, fill=condition)) + geom_boxplot() + theme_classic() + scale_fill_manual(values=c("#999999", "#E69F00")) + facet_grid(. ~ sex)
dev.off()

# --- males only ---
# Fix: the male-only PDF previously plotted vgf.gaba.sex.mean (both sexes pooled);
# it must plot the male subset that is computed for it.
vgf.gaba.male.mean <- subset(vgf.gaba.sex.mean, sex == "Male")
pdf("VGF.atac.cpm.male.boxplot.pdf")
ggplot(vgf.gaba.male.mean, aes(x=condition, y=sample.mean, fill=condition)) + geom_boxplot() + theme_classic() + scale_fill_manual(values=c("#999999", "#E69F00"))
dev.off()
/gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/Overlap/Exons_only
Ok.. completed the male and female analyses.
deseq analysis result files: /gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/Deseq2/Exons_only/bySex
edger analysis result files: /gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only/bySex
The results files for each sex separately have the naming convention: [Celltype]_results_[method]_age_rin_batch_group_[sex].csv
where [Celltype] is either ‘gaba’, ‘glu’, or ‘olig’, [method] is either ‘deseq2’ or ‘edgeR’ and [sex] is either ‘f’ or ‘m’.
There are also files for the intersections and unions of the deseq2 and edgeR results located at /gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/Overlap/Exons_only with naming convention: [Celltype]_age_rin_batch_group_[method]_bySex_[sex].csv where [Celltype] is either ‘gaba’, ‘glu’, or ‘olig’ (note: in the actual filenames these are spelled ‘Gabba’, ‘Glu’, and ‘Olig’, e.g. Gabba_age_rin_batch_group_intersection_bySex_f.csv), [method] is either ‘intersection’ or ‘union’ and [sex] is either ‘f’ or ‘m’.
## From ATAC GABA analysis:
# NPAS4, SHC4, FOS, PLP1 <-- all found for differential ATAC in female but NOT in male. (OPPOSITE?????)
# VGF found in male NOT female.
# PCSK1 found in both (no sig expression changes... both ~ -0.5 log2fc)
# ** SHC4 in male is just below the threshold used for "differential atac" and the peak is in the opposite direction from that in female ** (not consistent result in expression data)
## Look in DE analysis:
cd /gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/Overlap/Exons_only
# Look up candidate genes in the female (f) and male (m) DE intersection tables.
# The gene symbol is a double-quoted CSV field, so match the quoted field exactly
# with a fixed string (-F): a bare substring pattern such as 'FOS' would also
# return FOSB, FOSL1, ... and 'VGF'/'PLP1' any symbol containing them.
grep -F '"SHC4"' Gabba_age_rin_batch_group_intersection_bySex_f.csv
# "","Gene","Significance","log2FC","Padj"
# "27027","SHC4","NO",-0.41980894785658,0.999935856108311
grep -F '"SHC4"' Gabba_age_rin_batch_group_intersection_bySex_m.csv
# "447","SHC4","DOWN",-1.28431093050427,0.00829191057730395
grep -F '"NPAS4"' Gabba_age_rin_batch_group_intersection_bySex_f.csv
# "23632","NPAS4","NO",-0.564203792495955,0.999935856108311
grep -F '"NPAS4"' Gabba_age_rin_batch_group_intersection_bySex_m.csv
# "295","NPAS4","DOWN",-1.23358479057098,0.00691197542262688
grep -F '"FOS"' Gabba_age_rin_batch_group_intersection_bySex_f.csv
# "26438","FOS","NO",0.797065158935434,0.999935856108311
grep -F '"FOS"' Gabba_age_rin_batch_group_intersection_bySex_m.csv
# "106","FOS","DOWN",-2.00582903726307,0.00122215943535217
grep -F '"PLP1"' Gabba_age_rin_batch_group_intersection_bySex_f.csv
# "1056","PLP1","NO",-3.28440914151853,0.240999798867713
grep -F '"PLP1"' Gabba_age_rin_batch_group_intersection_bySex_m.csv
# "69","PLP1","DOWN",-1.74887789747449,0.00106461217253865
grep -F '"VGF"' Gabba_age_rin_batch_group_intersection_bySex_f.csv
# "19652","VGF","NO",-0.0641814862059653,1
grep -F '"VGF"' Gabba_age_rin_batch_group_intersection_bySex_m.csv
# "674","VGF","DOWN",-1.09275529120088,0.0149508904290993
grep -F '"PCSK1"' Gabba_age_rin_batch_group_intersection_bySex_f.csv
# "8123","PCSK1","NO",-0.519245736073292,0.933051680240602
grep -F '"PCSK1"' Gabba_age_rin_batch_group_intersection_bySex_m.csv
# "3352","PCSK1","NO",-0.597312058872925,0.131929450721691
grep -F '"DUSP6"' Gabba_age_rin_batch_group_intersection_bySex_f.csv
# "24931","DUSP6","NO",-0.361117843930742,0.999935856108311
grep -F '"DUSP6"' Gabba_age_rin_batch_group_intersection_bySex_m.csv
# "159","DUSP6","DOWN",-0.876241884453445,0.00402239511302101
# need to match chromosome ids...
# The path contains spaces and parentheses, so it must be quoted for the shell.
cd "/Users/27n/Dropbox (ORNL)/ATAC papers/ChIPseq/DEseq2_tables/M_or_F_peaks_separately"
# Rewrite UCSC-style chromosome names (chrN) to the RefSeq accessions used by the
# ATAC tables. The expressions run in order on every line, so the two-digit
# chromosomes (chr10-chr22) MUST come before chr1-chr9; otherwise "chr1" would
# match the prefix of "chr10".."chr19" (and "chr2" that of "chr20".."chr22").
for celltype in GABA GLU Olig; do
  sed \
    -e 's/chr10/NC_000010.11/g' \
    -e 's/chr11/NC_000011.10/g' \
    -e 's/chr12/NC_000012.12/g' \
    -e 's/chr13/NC_000013.11/g' \
    -e 's/chr14/NC_000014.9/g' \
    -e 's/chr15/NC_000015.10/g' \
    -e 's/chr16/NC_000016.10/g' \
    -e 's/chr17/NC_000017.11/g' \
    -e 's/chr18/NC_000018.10/g' \
    -e 's/chr19/NC_000019.10/g' \
    -e 's/chr20/NC_000020.11/g' \
    -e 's/chr21/NC_000021.9/g' \
    -e 's/chr22/NC_000022.11/g' \
    -e 's/chr1/NC_000001.11/g' \
    -e 's/chr2/NC_000002.12/g' \
    -e 's/chr3/NC_000003.12/g' \
    -e 's/chr4/NC_000004.12/g' \
    -e 's/chr5/NC_000005.10/g' \
    -e 's/chr6/NC_000006.12/g' \
    -e 's/chr7/NC_000007.14/g' \
    -e 's/chr8/NC_000008.11/g' \
    -e 's/chr9/NC_000009.12/g' \
    -e 's/chrX/NC_000023.11/g' \
    -e 's/chrY/NC_000024.10/g' \
    "${celltype}.M.ChIPseq.DEseq2.w_annot.txt" > "${celltype}.M.ChIPseq.DEseq2.w_annot.NCchr.txt"
done
# summarise and overlap chip / atac datasets
# Load the male ROTS differential-accessibility tables for each cell type and
# count peaks passing |logfc| > 0.5 with pvalue < 0.05 in each direction.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/Consensus.Peaks/ROTS.differential")
#setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ROTS.scripts")
gaba.atac <- read.delim("myresults_lower1_pfalse_gaba_male.bed", header = TRUE, sep = " ")
glu.atac <- read.delim("myresults_lower1_pfalse_glu_male.bed", header = TRUE, sep = " ")
olig.atac <- read.delim("myresults_lower1_pfalse_olig_male.bed", header = TRUE, sep = " ")
## negative logfc = higher in heroin
# Counting TRUE values with na.rm = TRUE gives the same number as nrow(subset(...)),
# because subset() also discards rows whose condition is NA.
nrow(gaba.atac)
# 54592
with(gaba.atac, sum(logfc < -0.5 & pvalue < 0.05, na.rm = TRUE)) # higher in heroin
# 339
with(gaba.atac, sum(logfc > 0.5 & pvalue < 0.05, na.rm = TRUE)) # higher in control
# 628
nrow(glu.atac)
# 93826
with(glu.atac, sum(logfc < -0.5 & pvalue < 0.05, na.rm = TRUE))
# 1111
with(glu.atac, sum(logfc > 0.5 & pvalue < 0.05, na.rm = TRUE))
# 1370
nrow(olig.atac)
# 49226
with(olig.atac, sum(logfc < -0.5 & pvalue < 0.05, na.rm = TRUE))
# 30
with(olig.atac, sum(logfc > 0.5 & pvalue < 0.05, na.rm = TRUE))
# 1468
# Peak width per region (recorded means are from the original session).
gaba.atac$length <- abs(gaba.atac$End - gaba.atac$Start)
# mean = 956.0081
glu.atac$length <- abs(glu.atac$End - glu.atac$Start)
# 890.3504
olig.atac$length <- abs(olig.atac$End - olig.atac$Start)
# 910.0742
# Load the male ChIP-seq DESeq2 tables (chromosomes already renamed to NC_ ids)
# and count peaks passing |LFC| > 0.5 with pvalue < 0.05 in each direction.
setwd("/Users/27n/Dropbox (ORNL)/ATAC papers/ChIPseq/DEseq2_tables/M_or_F_peaks_separately")
gaba.chip <- read.delim("GABA.M.ChIPseq.DEseq2.w_annot.NCchr.txt", header = TRUE, sep = "\t")
glu.chip <- read.delim("GLU.M.ChIPseq.DEseq2.w_annot.NCchr.txt", header = TRUE, sep = "\t")
olig.chip <- read.delim("Olig.M.ChIPseq.DEseq2.w_annot.NCchr.txt", header = TRUE, sep = "\t")
## negative logfc = higher in control
# sum(..., na.rm = TRUE) matches nrow(subset(...)): both ignore NA conditions.
nrow(gaba.chip)
# 95892
with(gaba.chip, sum(LFC < -0.5 & pvalue < 0.05, na.rm = TRUE)) # higher in control
# 2940
with(gaba.chip, sum(LFC > 0.5 & pvalue < 0.05, na.rm = TRUE)) # higher in heroin
# 6312
nrow(glu.chip)
# 101868
with(glu.chip, sum(LFC < -0.5 & pvalue < 0.05, na.rm = TRUE))
# 3529
with(glu.chip, sum(LFC > 0.5 & pvalue < 0.05, na.rm = TRUE))
# 3222
nrow(olig.chip)
# 87022
with(olig.chip, sum(LFC < -0.5 & pvalue < 0.05, na.rm = TRUE))
# 1087
with(olig.chip, sum(LFC > 0.5 & pvalue < 0.05, na.rm = TRUE))
# 2788
# Peak width per region (recorded means are from the original session).
gaba.chip$length <- abs(gaba.chip$end - gaba.chip$start)
# mean = 2162.64
glu.chip$length <- abs(glu.chip$end - glu.chip$start)
# 2042.097
olig.chip$length <- abs(olig.chip$end - olig.chip$start)
# 2112.721
# look at overlap
# Overlap ATAC and ChIP peaks per cell type. Both tables get a common set of
# interval columns (epi.chr / epi.start / epi.end) which genome_intersect() uses
# as the join key; GABA peaks are additionally labelled with a direction call.
# NOTE(review): all column selections below are positional (e.g. c(1:4,10,12,14:16));
# they silently depend on the input column order and on the columns appended
# earlier in this session (`length`, then the three epi.* columns).
# look at overlap
library(tidygenomics)
library(dplyr)
gaba.atac$epi.chr <- gaba.atac$Chr
gaba.atac$epi.start <- gaba.atac$Start
gaba.atac$epi.end <- gaba.atac$End
# ATAC sign convention (stated earlier in this file): negative logfc = higher in heroin.
gaba.atac.label <- gaba.atac[,c(1:4,10,12,14:16)] %>% mutate(atac.call = ifelse(logfc < -0.5 & pvalue < 0.05, "higher.heroin", ifelse(logfc > 0.5 & pvalue < 0.05, "higher.control", "not.sig")))
gaba.atac.label %>% group_by(atac.call) %>% count()
# 1 higher.control 628
# 2 higher.heroin 339
# 3 not.sig 53625
gaba.chip$epi.chr <- gaba.chip$seqnames
gaba.chip$epi.start <- gaba.chip$start
gaba.chip$epi.end <- gaba.chip$end
# ChIP sign convention is the opposite of ATAC: negative LFC = higher in control.
gaba.chip.label <- gaba.chip[,c(1:3,6,8:9,11,19:21,24:26)] %>% mutate(chip.call = ifelse(LFC < -0.5 & pvalue < 0.05, "higher.control", ifelse(LFC > 0.5 & pvalue < 0.05, "higher.heroin", "not.sig")))
gaba.chip.label %>% group_by(chip.call) %>% count()
# 1 higher.control 2940
# 2 higher.heroin 6312
# 3 not.sig 86640
gaba.atac.chip <- genome_intersect(gaba.atac.label, gaba.chip.label, by=c("epi.chr", "epi.start", "epi.end"), mode="both")
# 43447 [54592 atac, 95892 chip]
mean(abs(gaba.atac.chip$epi.start - gaba.atac.chip$epi.end))
# 923.5519
gaba.atac.chip %>% group_by(atac.call, chip.call) %>% count()
# atac.call chip.call n
# <chr> <chr> <int>
# 1 higher.control higher.control 122
# 2 higher.control higher.heroin 3
# 3 higher.control not.sig 484
# 4 higher.heroin higher.control 2
# 5 higher.heroin higher.heroin 40
# 6 higher.heroin not.sig 149
# 7 not.sig higher.control 920
# 8 not.sig higher.heroin 5794
# 9 not.sig not.sig 35933
glu.atac$epi.chr <- glu.atac$Chr
glu.atac$epi.start <- glu.atac$Start
glu.atac$epi.end <- glu.atac$End
glu.chip$epi.chr <- glu.chip$seqnames
glu.chip$epi.start <- glu.chip$start
glu.chip$epi.end <- glu.chip$end
# NOTE(review): GABA selects columns 14:16 (ATAC) / 24:26 (ChIP) and then adds the
# call column with mutate(); GLU and OLIG select 14:17 / 24:27 instead. Presumably
# the interactive session had an extra (call?) column on glu/olig tables that is
# not created anywhere in this section -- confirm, otherwise `[` fails with
# "undefined columns selected" when this is re-run from a clean session.
glu.atac.chip <- genome_intersect(glu.atac[,c(1:4,10,12,14:17)], glu.chip[,c(1:3,6,8:9,11,19:21,24:27)], by=c("epi.chr", "epi.start", "epi.end"), mode="both")
# 68093 [93826 atac, 101868 chip]
mean(abs(glu.atac.chip$epi.start - glu.atac.chip$epi.end))
# 882.6462
olig.atac$epi.chr <- olig.atac$Chr
olig.atac$epi.start <- olig.atac$Start
olig.atac$epi.end <- olig.atac$End
olig.chip$epi.chr <- olig.chip$seqnames
olig.chip$epi.start <- olig.chip$start
olig.chip$epi.end <- olig.chip$end
# Same positional selection (14:17 / 24:27) as GLU -- see the review note above.
olig.atac.chip <- genome_intersect(olig.atac[,c(1:4,10,12,14:17)], olig.chip[,c(1:3,6,8:9,11,19:21,24:27)], by=c("epi.chr", "epi.start", "epi.end"), mode="both")
# 36072 [49226 atac, 87022 chip]
mean(abs(olig.atac.chip$epi.start - olig.atac.chip$epi.end))
# 863.2611
#### introduce rnaseq...
## Look in DE analysis:
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/Overlap/Exons_only/Gabba_age_rin_batch_group_intersection_bySex_m.csv /Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq/.
# Join the male GABA RNA-seq DE table onto the ATAC/ChIP overlap (and onto ChIP
# alone) by gene symbol, then tabulate how expression calls line up with the
# chromatin calls.
setwd("/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/RNAseq/")
gaba.rna <- read.delim("Gabba_age_rin_batch_group_intersection_bySex_m.csv", header = TRUE, sep = ",")
# The epigenomic tables key genes by SYMBOL, so mirror the Gene column under that name.
gaba.rna[["SYMBOL"]] <- gaba.rna[["Gene"]]
# Expression call: |log2FC| > 0.5 with Padj < 0.05, otherwise "no.sig".
gaba.rna.label <- mutate(
  gaba.rna,
  rna.call = ifelse(Padj < 0.05 & log2FC < -0.5, "DOWN",
             ifelse(Padj < 0.05 & log2FC > 0.5, "UP", "no.sig"))
)
gaba.rna.epi <- left_join(gaba.atac.chip, gaba.rna.label, by = "SYMBOL")
gaba.rna.epi.call <- count(group_by(gaba.rna.epi, atac.call, chip.call, rna.call))
subset(gaba.rna.epi.call, rna.call == "UP")
# atac.call chip.call rna.call n
# <chr> <chr> <chr> <int>
# 1 higher.control not.sig UP 4
# 2 higher.heroin not.sig UP 1
# 3 not.sig higher.control UP 3
# 4 not.sig higher.heroin UP 24
# 5 not.sig not.sig UP 168
subset(gaba.rna.epi.call, rna.call == "DOWN")
# atac.call chip.call rna.call n
# <chr> <chr> <chr> <int>
# 1 higher.control higher.control DOWN 1
# 2 higher.control higher.heroin DOWN 1
# 3 higher.control not.sig DOWN 3
# 4 higher.heroin higher.heroin DOWN 1
# 5 not.sig higher.control DOWN 30
# 6 not.sig higher.heroin DOWN 32
# 7 not.sig not.sig DOWN 271
# ChIP only (all ChIP peaks, not just those overlapping an ATAC peak).
gaba.rna.chip <- left_join(gaba.chip.label, gaba.rna.label, by = "SYMBOL")
count(group_by(gaba.rna.chip, chip.call, rna.call))
# chip.call rna.call n
# <chr> <chr> <int>
# 1 higher.control DOWN 47 ###
# 2 higher.control no.sig 2033
# 3 higher.control UP 8
# 4 higher.control NA 852
# 5 higher.heroin DOWN 39
# 6 higher.heroin no.sig 5541
# 7 higher.heroin UP 25 ###
# 8 higher.heroin NA 707
# 9 not.sig DOWN 463
# 10 not.sig no.sig 72024
# 11 not.sig UP 251
# 12 not.sig NA 13902
# Same tabulation broken down by the peak annotation category.
gaba.rna.chip.call <- count(group_by(gaba.rna.chip, chip.call, rna.call, annotation))
subset(gaba.rna.chip.call, chip.call == "higher.control" & rna.call == "DOWN")
# chip.call rna.call annotation n
# <chr> <chr> <chr> <int>
# higher.control DOWN Distal Intergenic 18 ###
# higher.control DOWN Exon 6
# higher.control DOWN Intron 14 ###
# higher.control DOWN Promoter (<=1kb) 8
# higher.control DOWN Promoter (1-2kb) 1
subset(gaba.rna.chip.call, chip.call == "higher.heroin" & rna.call == "UP")
# chip.call rna.call annotation n
# <chr> <chr> <chr> <int>
# 1 higher.heroin UP 3' UTR 1
# 2 higher.heroin UP Distal Intergenic 3
# 3 higher.heroin UP Exon (ENST00000624917.1/ENST00000624917.1, exon … 1
# 4 higher.heroin UP Promoter (<=1kb) 19 ###
# 5 higher.heroin UP Promoter (1-2kb) 1
#### genes of interest???
# Genes where the chromatin call and the expression call agree in direction.
select(filter(gaba.rna.epi, chip.call == "higher.heroin", rna.call == "UP"), SYMBOL)
# 24
# [1] "DDIT4" "NXNL2" "FAM107A" "ZNF799" "FAM107A"
# [6] "MORN3" "PBX4" "LOC107986718" "GNG8" "FANCE"
# [11] "LOC105375216" "ATAD2" "VASH2" "TULP3" "TEX45"
# [16] "LINC02703" "LOC105376289" "LOC105374116" "LOC105376289" "MIR3191"
# [21] "ATAD2" "VASH2" "MIR3191" "RAI14"
select(filter(gaba.rna.epi, atac.call == "higher.heroin", rna.call == "UP"), SYMBOL)
# GNG8
select(filter(gaba.rna.epi, chip.call == "higher.control", rna.call == "DOWN"), SYMBOL)
# 31
# [1] "DUSP4" "DUSP6" "COL25A1" "MAP3K8" "NPAS4"
# [6] "FOS" "DUSP6" "GEM" "FOS" "FAM110C"
# [11] "EGR4" "SHC4" "HDAC4-AS1" "FOS" "SEMA3D"
# [16] "DKK2" "NPTX2" "MOBP" "SHC4" "HDAC4-AS1"
# [21] "CD84" "HDAC4-AS1" "FLT1" "NR4A1" "UBALD2"
# [26] "COL25A1" "LOC101927673" "LOC101927673" "MOBP" "DKK2"
# [31] "FAT1"
select(filter(gaba.rna.epi, atac.call == "higher.control", rna.call == "DOWN"), SYMBOL)
# SST, TRIB1, COL25A1, MFAP3L, RCN1
library(dplyr)
# Stack the six sex-specific DE intersection tables (3 cell types x 2 sexes),
# tag each with cell type and sex, and export one CSV for the Shiny app.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/Overlap/Exons_only")

# GABA
gaba.f <- read.delim("Gabba_age_rin_batch_group_intersection_bySex_f.csv", header = TRUE, sep = ",")
gaba.f$cell_type <- "GABA"
gaba.f$sex <- "female"
gaba.m <- read.delim("Gabba_age_rin_batch_group_intersection_bySex_m.csv", header = TRUE, sep = ",")
gaba.m$cell_type <- "GABA"
gaba.m$sex <- "male"
gaba <- rbind(gaba.f, gaba.m)

# GLU
glu.f <- read.delim("Glu_age_rin_batch_group_intersection_bySex_f.csv", header = TRUE, sep = ",")
glu.f$cell_type <- "GLU"
glu.f$sex <- "female"
glu.m <- read.delim("Glu_age_rin_batch_group_intersection_bySex_m.csv", header = TRUE, sep = ",")
glu.m$cell_type <- "GLU"
glu.m$sex <- "male"
glu <- rbind(glu.f, glu.m)

# OLIG
olig.f <- read.delim("Olig_age_rin_batch_group_intersection_bySex_f.csv", header = TRUE, sep = ",")
olig.f$cell_type <- "OLIG"
olig.f$sex <- "female"
olig.m <- read.delim("Olig_age_rin_batch_group_intersection_bySex_m.csv", header = TRUE, sep = ",")
olig.m$cell_type <- "OLIG"
olig.m$sex <- "male"
olig <- rbind(olig.f, olig.m)

gaba.glu.olig <- rbind(gaba, glu, olig)
# Columns 6, 7, 2, 4, 5 of the stacked table, selected by name.
# NOTE(review): after renaming, `cluster` holds the cell type and `Cell.type`
# holds the sex -- confirm that this is what the Shiny app expects.
df <- setNames(
  gaba.glu.olig[, c("cell_type", "sex", "Gene", "log2FC", "Padj")],
  c("cluster", "Cell.type", "gene", "avg_log2FC", "p_val_adj")
)
# 169784
write.table(df, "mt.sinai.rnaseq.sex.shiny.csv", quote = FALSE, row.names = FALSE, sep = ",")
generated by matt: /gpfs/alpine/syb105/proj-shared/Personal/lanemj/projects/mtSinaiOpioids
# login sequence: ssh or-slurm-login.ornl.gov -> ssh brut-login01.ornl.gov -> cd /lustre/or-scratch/cades-bsd/27n
### need to properly make HiC-Pro
# Build a dedicated conda environment (by path, -p) holding the HiC-Pro
# dependencies, each pinned to an explicit version.
cd /lustre/or-scratch/cades-bsd/27n
module load anaconda3
conda create -p /lustre/or-scratch/cades-bsd/27n/envs/HiCPro python=3.8
conda activate /lustre/or-scratch/cades-bsd/27n/envs/HiCPro
# NOTE(review): "-n base" updates conda in the base environment, not in the
# HiCPro environment that was just activated.
conda update -n base conda
# Python packages
conda install -c conda-forge scipy=1.7.0
#conda install -c conda-forge numpy=1.21.1 # loaded with scipy
conda install -c bioconda iced=0.5.10
conda install -c conda-forge -c bioconda bx-python=0.8.11 # failed... needed to add the -c conda-forge
conda install -c bioconda pysam=0.16.0.1
# NOTE(review): recorded as failed and never retried in this log; bx-python only
# resolved after adding -c conda-forge, possibly the same cause -- unverified.
conda install -c bioconda cooler=0.8.11 # failed
# R and the R packages
conda install -c conda-forge r-base=4.0.3
conda install -c conda-forge r-ggplot2=3.3.5
conda install -c conda-forge r-rcolorbrewer=1.1_2
conda install -c conda-forge r-gridbase=0.4_7
# Aligner, BAM tooling, QC report and compiler
conda install -c conda-forge tbb=2020.2
conda install -c bioconda bowtie2=2.4.4
conda install -c bioconda samtools=1.12
conda install -c bioconda multiqc=1.11
conda install -c conda-forge gxx
#cd /lustre/or-scratch/cades-bsd/27n
#module load anaconda3
#git clone https://github.com/nservant/HiC-Pro.git
# Configure and build HiC-Pro against the tools in the HiCPro conda environment.
# The commented key = value lines record what was entered in config-install.txt.
vim /lustre/or-scratch/cades-bsd/27n/HiC-Pro/config-install.txt
#PREFIX = /lustre/or-scratch/cades-bsd/27n
#BOWTIE2_PATH = /lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin/bowtie2
#SAMTOOLS_PATH = /lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin/samtools
#R_PATH = /lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin/R
#PYTHON_PATH = /lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin/python3.8
#CLUSTER_SYS = SLURM
cd /lustre/or-scratch/cades-bsd/27n/HiC-Pro/
# The environment must be active so that make sees the tools listed above.
conda activate /lustre/or-scratch/cades-bsd/27n/envs/HiCPro
make configure
make install
# NOTE(review): later commands in this file run
# /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -- presumably the
# directory "make install" created under PREFIX; confirm.
# edit config-hicpro.txt
# slurm commands, bowtie2 index, genome file, chr file, restriction file
# move genome and chr file to annotation directory
# /lustre/or-scratch/cades-bsd/27n/HiC-Pro/annotation/.
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic
# move hic data into fastq/opioid.samples directory (one folder for each sample)
# /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/HiC/
#### Test dataset
## Get the data. Will download a test_data folder and a configuration file
# -p: also create the parent test.data/ directory (plain mkdir fails if it is
# missing) and do not error if the directory already exists on a re-run.
mkdir -p /lustre/or-scratch/cades-bsd/27n/test.data/fastq/
cd /lustre/or-scratch/cades-bsd/27n/test.data/fastq/
# Download and unpack the HiC-Pro test data (extracts a test_data/ folder here).
wget https://zerkalo.curie.fr/partage/HiC-Pro/HiCPro_testdata.tar.gz && tar -zxvf HiCPro_testdata.tar.gz
## Run HiC-Pro
cd /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0
# -i input folder (one sub-folder per sample), -o output folder, -c configuration file
time /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/test.data/fastq/test_data -o /lustre/or-scratch/cades-bsd/27n/test.data/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt
# Run HiC-Pro 3.1.0
# --------------------------------------------
# Tue Mar 29 07:58:29 EDT 2022
# Bowtie2 alignment step1 ...
# Logs: logs/dixon_2M_2/mapping_step1.log
# [main_samview] fail to read the header from "-".
# [main_samview] fail to read the header from "-".
# Exit: Error in reads alignment - Exit
# make: *** [/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/../scripts//Makefile:88: bowtie_global] Error 1
# edit config-hicpro.txt file --> add email, job name/memory/wall time, remove job queue, and adjust file extension (.R1 instead of _R1)
# move GR38 annotations into directory and adjust in config script
## scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.chr.size /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/.
## scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.fna /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/.
## scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/bowtie2/* /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/bt2/.
conda activate /lustre/or-scratch/cades-bsd/27n/envs/HiCPro
cd /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0
# -p (parallel mode) does not run the pipeline; it writes the
# HiCPro_step1_/HiCPro_step2_ submission scripts to be launched with sbatch.
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro \
  -i /lustre/or-scratch/cades-bsd/27n/opioid.hic/fastq \
  -o /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro \
  -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt \
  -p
# Run HiC-Pro 3.1.0 parallel mode
# The following command will launch the parallel workflow through 24 torque jobs:
# sbatch HiCPro_step1_hicpro.sh
# The following command will merge the processed data and run the remaining steps per sample:
# sbatch HiCPro_step2_hicpro.sh
### configure specs for BRUT... HiCPro_step1_hicpro.sh --> HiCPro_step1_hicpro_brut.sh
#!/bin/bash
# Slurm header used for HiCPro_step1_hicpro_brut.sh (BRUT-specific copy of the
# generated HiCPro_step1_hicpro.sh). Only the resource directives are recorded here.
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
# NOTE(review): with --ntasks=1, a per-socket task limit of 2 looks redundant -- confirm.
#SBATCH --ntasks-per-socket=2
# Slurm time format is days-hours:minutes:seconds, so this requests 48 days.
#SBATCH --time=48-0:0:0
# Memory is per CPU: 16 CPUs x 42G = 672G requested for every array task.
#SBATCH --mem-per-cpu=42G
#SBATCH -p brut_batch
#SBATCH --mail-user=noshayjm@ornl.gov
#SBATCH --mail-type=end
#SBATCH --job-name=HiCpro_s1_hicpro
#SBATCH --export=ALL
# One array task per job reported by "HiC-Pro -p" (24 jobs).
#SBATCH --array=1-24
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
# Step 2 merges the data processed by step 1, so it must not start until every
# step-1 array task has finished successfully. Capture the step-1 job id and
# make step 2 depend on it instead of submitting both jobs back to back.
# (--parsable prints "jobid[;cluster]"; strip the optional cluster suffix.)
step1_jobid=$(sbatch --parsable HiCPro_step1_hicpro_brut.sh)
sbatch --dependency=afterok:"${step1_jobid%%;*}" HiCPro_step2_hicpro.sh
#### jobs killed after 10? days of running...
##HiC-Pro mapping
# Use of uninitialized value $l in scalar chomp at /lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin/bowtie2 line 588, <BT> line 495316969.
# Use of uninitialized value $l in substitution (s///) at /lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin/bowtie2 line 589, <BT> line 495316969.
# (ERR): bowtie2-align died with signal 7 (BUS)
## try to test if it will run bowtie2 (not from config submit script specifications)
# Submit the stand-alone bowtie2 test script from the GABA372 log directory.
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/logs/GABA372
sbatch test.bowtie.sh # job 8839, submitted 2:57 pm on 4/12/22
### this completed at 8:31pm and appears to have processed correctly... what is going on?
### run command for all samples and then start HiC-Pro after mapping step??
# --> When I submit all of the jobs at once it is VERY VERY slow... 14 hrs and not even 6G run? --> Try canceling all but one job and let that one run through? Might be working better...
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --time=48-0:0:0
#SBATCH --mem-per-cpu=42G
#SBATCH -p brut_batch
#SBATCH --mail-user=noshayjm@ornl.gov
#SBATCH --mail-type=end
#SBATCH --job-name=gaba372.bowtie.test
#SBATCH --export=ALL
# Global (end-to-end) bowtie2 alignment of the lane/mate FASTQ files of sample
# GABA372, run by hand instead of through HiC-Pro's step-1 submit script.
# Every FASTQ is aligned as unpaired reads (-U). Aligned reads are written to
# <tag>_GRCh38.p13.bt2.bwt2glob.bam (samtools view -F 4 drops unmapped reads)
# and reads that fail to align go to the matching *.unmap.fastq (--un).

SAMPLE=GABA372
RUN_PREFIX=GABA372_AGTCAA_HWJW2DSXY
ENV_BIN=/lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin
HICPRO_DIR=/lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
BT2_INDEX=/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/bt2/GRCh38.p13.bt2
INDEX_TAG=GRCh38.p13.bt2

# Use as many bowtie2 threads as SLURM allocated CPUs. The previous hard-coded
# -p 240 requested 15x more threads than --cpus-per-task=16 provides.
THREADS=${SLURM_CPUS_PER_TASK:-16}

# Make a bowtie2 failure visible even though samtools is the last pipe stage.
set -o pipefail
status=0

for lane in L001 L002 L003 L004; do
    for mate in R1 R2; do
        # L001 R1 was commented out in the original script -- presumably it was
        # already aligned by the earlier test job; confirm before re-enabling.
        if [[ ${lane} == L001 && ${mate} == R1 ]]; then
            continue
        fi

        tag=${RUN_PREFIX}_${lane}_001.${mate}
        out=${HICPRO_DIR}/bowtie_results/bwt2_global/${SAMPLE}/${tag}_${INDEX_TAG}.bwt2glob

        # The original L001 R2 command read an uncompressed .fastq while every
        # other command read .fastq.gz; accept whichever of the two exists.
        fastq=${HICPRO_DIR}/rawdata/${SAMPLE}/${tag}.fastq.gz
        [[ -f ${fastq} ]] || fastq=${fastq%.gz}

        if ! "${ENV_BIN}/bowtie2" --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder \
                --un "${out}.unmap.fastq" --rg-id BMG --rg "SM:${tag}" -p "${THREADS}" \
                -x "${BT2_INDEX}" -U "${fastq}" \
                2>> "${HICPRO_DIR}/logs/${SAMPLE}/${tag}_bowtie2.log" \
            | "${ENV_BIN}/samtools" view -F 4 -bS - > "${out}.bam"; then
            # Keep going with the remaining files, but remember the failure.
            echo "bowtie2/samtools failed for ${tag}" >&2
            status=1
        fi
    done
done

exit "${status}"
# sbatch /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/logs/GABA372/GABA372.bowtie.sh
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --time=48-0:0:0
#SBATCH --mem-per-cpu=42G
#SBATCH -p brut_batch
#SBATCH --mail-user=noshayjm@ornl.gov
#SBATCH --mail-type=end
#SBATCH --job-name=gaba376.bowtie.test
#SBATCH --export=ALL
# Global (end-to-end) bowtie2 alignment of all lane/mate FASTQ files of sample
# GABA376, run by hand instead of through HiC-Pro's step-1 submit script.
# Every FASTQ is aligned as unpaired reads (-U). Aligned reads are written to
# <tag>_GRCh38.p13.bt2.bwt2glob.bam (samtools view -F 4 drops unmapped reads)
# and reads that fail to align go to the matching *.unmap.fastq (--un).

SAMPLE=GABA376
RUN_PREFIX=GABA376_CCGTCC_HWJW2DSXY
ENV_BIN=/lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin
HICPRO_DIR=/lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
BT2_INDEX=/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/bt2/GRCh38.p13.bt2
INDEX_TAG=GRCh38.p13.bt2

# Use as many bowtie2 threads as SLURM allocated CPUs. The previous hard-coded
# -p 240 requested 15x more threads than --cpus-per-task=16 provides.
THREADS=${SLURM_CPUS_PER_TASK:-16}

# Make a bowtie2 failure visible even though samtools is the last pipe stage.
set -o pipefail
status=0

for lane in L001 L002 L003 L004; do
    for mate in R1 R2; do
        tag=${RUN_PREFIX}_${lane}_001.${mate}
        out=${HICPRO_DIR}/bowtie_results/bwt2_global/${SAMPLE}/${tag}_${INDEX_TAG}.bwt2glob

        if ! "${ENV_BIN}/bowtie2" --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder \
                --un "${out}.unmap.fastq" --rg-id BMG --rg "SM:${tag}" -p "${THREADS}" \
                -x "${BT2_INDEX}" -U "${HICPRO_DIR}/rawdata/${SAMPLE}/${tag}.fastq.gz" \
                2>> "${HICPRO_DIR}/logs/${SAMPLE}/${tag}_bowtie2.log" \
            | "${ENV_BIN}/samtools" view -F 4 -bS - > "${out}.bam"; then
            # Keep going with the remaining files, but remember the failure.
            echo "bowtie2/samtools failed for ${tag}" >&2
            status=1
        fi
    done
done

exit "${status}"
# sbatch /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/logs/GABA376/GABA376.bowtie.sh
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --time=48-0:0:0
#SBATCH --mem-per-cpu=42G
#SBATCH -p brut_batch
#SBATCH --mail-user=noshayjm@ornl.gov
#SBATCH --mail-type=end
#SBATCH --job-name=glu372.bowtie.test
#SBATCH --export=ALL
# Global (end-to-end) bowtie2 alignment of all lane/mate FASTQ files of sample
# Glu372, run by hand instead of through HiC-Pro's step-1 submit script.
# Every FASTQ is aligned as unpaired reads (-U). Aligned reads are written to
# <tag>_GRCh38.p13.bt2.bwt2glob.bam (samtools view -F 4 drops unmapped reads)
# and reads that fail to align go to the matching *.unmap.fastq (--un).

SAMPLE=Glu372
RUN_PREFIX=Glu372_CTTGTA_HWJW2DSXY
ENV_BIN=/lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin
HICPRO_DIR=/lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
BT2_INDEX=/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/bt2/GRCh38.p13.bt2
INDEX_TAG=GRCh38.p13.bt2

# Use as many bowtie2 threads as SLURM allocated CPUs. The previous hard-coded
# -p 240 requested 15x more threads than --cpus-per-task=16 provides.
THREADS=${SLURM_CPUS_PER_TASK:-16}

# Make a bowtie2 failure visible even though samtools is the last pipe stage.
set -o pipefail
status=0

for lane in L001 L002 L003 L004; do
    for mate in R1 R2; do
        tag=${RUN_PREFIX}_${lane}_001.${mate}
        out=${HICPRO_DIR}/bowtie_results/bwt2_global/${SAMPLE}/${tag}_${INDEX_TAG}.bwt2glob

        if ! "${ENV_BIN}/bowtie2" --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder \
                --un "${out}.unmap.fastq" --rg-id BMG --rg "SM:${tag}" -p "${THREADS}" \
                -x "${BT2_INDEX}" -U "${HICPRO_DIR}/rawdata/${SAMPLE}/${tag}.fastq.gz" \
                2>> "${HICPRO_DIR}/logs/${SAMPLE}/${tag}_bowtie2.log" \
            | "${ENV_BIN}/samtools" view -F 4 -bS - > "${out}.bam"; then
            # Keep going with the remaining files, but remember the failure.
            echo "bowtie2/samtools failed for ${tag}" >&2
            status=1
        fi
    done
done

exit "${status}"
# sbatch /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/logs/Glu372/Glu372.bowtie.sh
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --time=48-0:0:0
#SBATCH --mem-per-cpu=42G
#SBATCH -p brut_batch
#SBATCH --mail-user=noshayjm@ornl.gov
#SBATCH --mail-type=end
#SBATCH --job-name=glu376.bowtie.test
#SBATCH --export=ALL
# Global (end-to-end) bowtie2 alignment of all lane/mate FASTQ files of sample
# Glu376, run by hand instead of through HiC-Pro's step-1 submit script.
# Every FASTQ is aligned as unpaired reads (-U). Aligned reads are written to
# <tag>_GRCh38.p13.bt2.bwt2glob.bam (samtools view -F 4 drops unmapped reads)
# and reads that fail to align go to the matching *.unmap.fastq (--un).

SAMPLE=Glu376
RUN_PREFIX=Glu376_ATGTCA_HWJW2DSXY
ENV_BIN=/lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin
HICPRO_DIR=/lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
BT2_INDEX=/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/bt2/GRCh38.p13.bt2
INDEX_TAG=GRCh38.p13.bt2

# Use as many bowtie2 threads as SLURM allocated CPUs. The previous hard-coded
# -p 240 requested 15x more threads than --cpus-per-task=16 provides.
THREADS=${SLURM_CPUS_PER_TASK:-16}

# Make a bowtie2 failure visible even though samtools is the last pipe stage.
set -o pipefail
status=0

for lane in L001 L002 L003 L004; do
    for mate in R1 R2; do
        tag=${RUN_PREFIX}_${lane}_001.${mate}
        out=${HICPRO_DIR}/bowtie_results/bwt2_global/${SAMPLE}/${tag}_${INDEX_TAG}.bwt2glob

        if ! "${ENV_BIN}/bowtie2" --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder \
                --un "${out}.unmap.fastq" --rg-id BMG --rg "SM:${tag}" -p "${THREADS}" \
                -x "${BT2_INDEX}" -U "${HICPRO_DIR}/rawdata/${SAMPLE}/${tag}.fastq.gz" \
                2>> "${HICPRO_DIR}/logs/${SAMPLE}/${tag}_bowtie2.log" \
            | "${ENV_BIN}/samtools" view -F 4 -bS - > "${out}.bam"; then
            # Keep going with the remaining files, but remember the failure.
            echo "bowtie2/samtools failed for ${tag}" >&2
            status=1
        fi
    done
done

exit "${status}"
# sbatch /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/logs/Glu376/Glu376.bowtie.sh
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --time=48-0:0:0
#SBATCH --mem-per-cpu=42G
#SBATCH -p brut_batch
#SBATCH --mail-user=noshayjm@ornl.gov
#SBATCH --mail-type=end
#SBATCH --job-name=olig372.bowtie.test
#SBATCH --export=ALL

# Global (end-to-end) bowtie2 alignment of every OLIG372 lane/read FASTQ.
# Each read file is aligned independently as single-end (-U); unmapped reads
# are written to the *.unmap.fastq file via --un, and only mapped records
# (samtools view -F 4) are kept in the output BAM.

ENV_BIN=/lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin
HICPRO_OUT=/lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
BT2_INDEX=/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/bt2/GRCh38.p13.bt2
SAMPLE=OLIG372
BARCODE=AGTTCC

# Match bowtie2's thread count to the cores SLURM actually allocated
# (--cpus-per-task above). The previous hard-coded "-p 240" oversubscribed
# the 16 allocated cores. Falls back to 16 when run outside of SLURM.
THREADS="${SLURM_CPUS_PER_TASK:-16}"

# Same processing order as before: L001 R1, L001 R2, L002 R1, ... L004 R2.
for lane in L001 L002 L003 L004; do
    for read in R1 R2; do
        prefix="${SAMPLE}_${BARCODE}_HWJW2DSXY_${lane}_001.${read}"
        bam_dir="${HICPRO_OUT}/bowtie_results/bwt2_global/${SAMPLE}"

        # stderr (bowtie2 alignment summary) is appended to the per-file log.
        "${ENV_BIN}/bowtie2" --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder \
            --un "${bam_dir}/${prefix}_GRCh38.p13.bt2.bwt2glob.unmap.fastq" \
            --rg-id BMG --rg "SM:${prefix}" -p "${THREADS}" \
            -x "${BT2_INDEX}" \
            -U "${HICPRO_OUT}/rawdata/${SAMPLE}/${prefix}.fastq.gz" \
            2>> "${HICPRO_OUT}/logs/${SAMPLE}/${prefix}_bowtie2.log" \
            | "${ENV_BIN}/samtools" view -F 4 -bS - \
            > "${bam_dir}/${prefix}_GRCh38.p13.bt2.bwt2glob.bam"
    done
done
# sbatch /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/logs/OLIG372/OLIG372.bowtie.sh
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --time=48-0:0:0
#SBATCH --mem-per-cpu=42G
#SBATCH -p brut_batch
#SBATCH --mail-user=noshayjm@ornl.gov
#SBATCH --mail-type=end
#SBATCH --job-name=olig376.bowtie.test
#SBATCH --export=ALL

# Global (end-to-end) bowtie2 alignment of every OLIG376 lane/read FASTQ.
# Each read file is aligned independently as single-end (-U); unmapped reads
# are written to the *.unmap.fastq file via --un, and only mapped records
# (samtools view -F 4) are kept in the output BAM.

ENV_BIN=/lustre/or-scratch/cades-bsd/27n/envs/HiCPro/bin
HICPRO_OUT=/lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
BT2_INDEX=/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/bt2/GRCh38.p13.bt2
SAMPLE=OLIG376
BARCODE=GTCCGC

# Match bowtie2's thread count to the cores SLURM actually allocated
# (--cpus-per-task above). The previous hard-coded "-p 240" oversubscribed
# the 16 allocated cores. Falls back to 16 when run outside of SLURM.
THREADS="${SLURM_CPUS_PER_TASK:-16}"

# Same processing order as before: L001 R1, L001 R2, L002 R1, ... L004 R2.
for lane in L001 L002 L003 L004; do
    for read in R1 R2; do
        prefix="${SAMPLE}_${BARCODE}_HWJW2DSXY_${lane}_001.${read}"
        bam_dir="${HICPRO_OUT}/bowtie_results/bwt2_global/${SAMPLE}"

        # stderr (bowtie2 alignment summary) is appended to the per-file log.
        "${ENV_BIN}/bowtie2" --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder \
            --un "${bam_dir}/${prefix}_GRCh38.p13.bt2.bwt2glob.unmap.fastq" \
            --rg-id BMG --rg "SM:${prefix}" -p "${THREADS}" \
            -x "${BT2_INDEX}" \
            -U "${HICPRO_OUT}/rawdata/${SAMPLE}/${prefix}.fastq.gz" \
            2>> "${HICPRO_OUT}/logs/${SAMPLE}/${prefix}_bowtie2.log" \
            | "${ENV_BIN}/samtools" view -F 4 -bS - \
            > "${bam_dir}/${prefix}_GRCh38.p13.bt2.bwt2glob.bam"
    done
done
# sbatch /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/logs/OLIG376/OLIG376.bowtie.sh
# mapping, proc_hic, quality_checks, merge_persample, build_contact_maps, ice_norm
## mapping [DON'T RUN]
# cd /lustre/or-scratch/cades-bsd/27n/opioid.hic
#
# /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/opioid.hic/fastq -o /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s mapping
#
# cd /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/opioid.samples
# sbatch HiCPro_step2_hicpro.sh
# # change to HiCPro_step2_hicpro_map.sh
## processing **** doesn't work with just traditional bowtie2 output... need multi-step processing before this step...
# ** need to load samtools; sort, index, and view the bam files, and then run
# /home/27n/.conda/envs/HiCPro/bin/samtools sort *.bam
# /home/27n/.conda/envs/HiCPro/bin/samtools index *.bam
# /home/27n/.conda/envs/HiCPro/bin/samtools view -c *.bam
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/bowtie_results/bwt2_global -o /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s proc_hic
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
# sbatch HiCPro_step1_hicpro.sh
# change to HiCPro_step1_hicpro_proc.sh
sbatch HiCPro_step1_hicpro_proc.sh
## quality checks
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/opioid.hic/fastq -o /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s quality_checks
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/bowtie_results/bwt2_global -o /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s quality_checks
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic
# sbatch HiCPro_step2_hicpro.sh
# change to HiCPro_step2_hicpro_QC.sh
sbatch HiCPro_step2_hicpro_QC.sh
## merge
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/bowtie_results/<INPUT_.validPairs > -o /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s merge_persample
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
## contact maps
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/bowtie_results/<INPUT_.validPairs > -o /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s build_contact_maps
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
## ICE normalization
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro/bowtie_results/<INPUT_matrix> -o /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s ice_norm
cd /lustre/or-scratch/cades-bsd/27n/opioid.hic/hicpro
#cp opioid.hic/fastq/GABA372/GABA372_AGTCAA_HWJW2DSXY_L00* gaba372.hic/fastq/GABA372/.
#cp opioid.hic/fastq/GABA376/GABA376_CCGTCC_HWJW2DSXY_L00* gaba376.hic/fastq/GABA376/.
#cp opioid.hic/fastq/Glu376/Glu376_ATGTCA_HWJW2DSXY_L001* glu376.hic/fastq/GLU376/.
#cp opioid.hic/fastq/Glu372/Glu372_CTTGTA_HWJW2DSXY_L00* glu372.hic/fastq/GLU372/.
#cp opioid.hic/fastq/OLIG376/OLIG376_GTCCGC_HWJW2DSXY_L00* olig376.hic/fastq/OLIG376/.
#cp opioid.hic/fastq/OLIG372/OLIG372_AGTTCC_HWJW2DSXY_L00* olig372.hic/fastq/OLIG372/.
# GABA372
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/gaba372.hic/fastq -o /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/
sbatch HiCPro_step1_hicpro.sh
sbatch HiCPro_step2_hicpro.sh
# head /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/logs/GABA372/mergeSAM.log
### [E::idx_find_and_load] Could not retrieve index file for 'bowtie_results/bwt2/GABA372/GABA372_AGTCAA_HWJW2DSXY_L002_001.R1_GRCh38.p13.bt2.bwt2merged.bam'
# stopping at restriction sites...
### need to generate a proper restriction enzyme file (^GATC and G^ANTC)
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/utils/digest_genome.py -r ^GATC G^ANTC -o /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/hg38.arima.restriction.bed /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/GCF_000001405.39_GRCh38.p13_genomic.fna
# need to adjust restriction site file in /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt
## LIGATION_SITE = GATCGATC,GANTCGANTC
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/bowtie_results/bwt2 -o /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s proc_hic
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/
sbatch HiCPro_step1_hicpro.sh
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/data -o /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s quality_checks -s merge_persample -s build_contact_maps -s ice_norm
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/
sbatch HiCPro_step2_hicpro.sh # Running: 9 May
# GABA376
cd /lustre/or-scratch/cades-bsd/27n/gaba376.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/gaba376.hic/fastq -o /lustre/or-scratch/cades-bsd/27n/gaba376.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p
cd /lustre/or-scratch/cades-bsd/27n/gaba376.hic/hicpro/
sbatch HiCPro_step1_hicpro.sh
sbatch HiCPro_step2_hicpro.sh
cd /lustre/or-scratch/cades-bsd/27n/gaba376.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/gaba376.hic/hicpro/bowtie_results/bwt2 -o /lustre/or-scratch/cades-bsd/27n/gaba376.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p -s proc_hic -s quality_checks -s merge_persample -s build_contact_maps -s ice_norm
cd /lustre/or-scratch/cades-bsd/27n/gaba376.hic/hicpro/
sbatch HiCPro_step1_hicpro.sh # Submitted: 9 May
sbatch HiCPro_step2_hicpro.sh # Submitted: 9 May
# GLU376
cd /lustre/or-scratch/cades-bsd/27n/glu376.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/glu376.hic/fastq -o /lustre/or-scratch/cades-bsd/27n/glu376.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p
cd /lustre/or-scratch/cades-bsd/27n/glu376.hic/hicpro/
sbatch HiCPro_step1_hicpro.sh # Submitted: 9 May
sbatch HiCPro_step2_hicpro.sh # Submitted: 9 May
# GLU372
cd /lustre/or-scratch/cades-bsd/27n/glu372.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/glu372.hic/fastq -o /lustre/or-scratch/cades-bsd/27n/glu372.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p
cd /lustre/or-scratch/cades-bsd/27n/glu372.hic/hicpro/
sbatch HiCPro_step1_hicpro.sh # Submitted: 9 May
sbatch HiCPro_step2_hicpro.sh # Submitted: 9 May
# OLIG376
cd /lustre/or-scratch/cades-bsd/27n/olig376.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/olig376.hic/fastq -o /lustre/or-scratch/cades-bsd/27n/olig376.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p
cd /lustre/or-scratch/cades-bsd/27n/olig376.hic/hicpro/
sbatch HiCPro_step1_hicpro.sh # Submitted: 9 May
sbatch HiCPro_step2_hicpro.sh # Submitted: 9 May
# OLIG372
cd /lustre/or-scratch/cades-bsd/27n/olig372.hic
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/HiC-Pro -i /lustre/or-scratch/cades-bsd/27n/olig372.hic/fastq -o /lustre/or-scratch/cades-bsd/27n/olig372.hic/hicpro -c /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt -p
cd /lustre/or-scratch/cades-bsd/27n/olig372.hic/hicpro/
sbatch HiCPro_step1_hicpro.sh # Submitted: 9 May
sbatch HiCPro_step2_hicpro.sh # Submitted: 9 May
# globus move all HiC-Pro output files to OLCF from CADES
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro
# adjust BIN_SIZE in config-hicpro file
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro
# scp /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/config-hicpro.txt noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/.
# scp /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/hg38.arima.restriction.bed noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/annotation/.
# scp /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/annotation/bt2/* noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/annotation/.
# then adjust config-hicpro file for OLCF system specifics
# gaba372
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/data -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s build_contact_maps -s ice_norm
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/
vim HiCPro_step2_hicpro.sh ## add #SBATCH -A SYB105
sbatch HiCPro_step2_hicpro.sh
# gaba376
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba376.hic
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba376.hic/hicpro/hic_results/data -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba376.hic/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s build_contact_maps -s ice_norm
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba376.hic/hicpro/
sbatch HiCPro_step2_hicpro.sh
# glu372
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu372.hic
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu372.hic/hicpro/hic_results/data -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu372.hic/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s build_contact_maps -s ice_norm
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu372.hic/hicpro/
sbatch HiCPro_step2_hicpro.sh
# glu376
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu376.hic
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu376.hic/hicpro/hic_results/data -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu376.hic/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s build_contact_maps -s ice_norm
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu376.hic/hicpro/
sbatch HiCPro_step2_hicpro.sh
# olig372
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig372.hic
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig372.hic/hicpro/hic_results/data -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig372.hic/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s build_contact_maps -s ice_norm
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig372.hic/hicpro/
sbatch HiCPro_step2_hicpro.sh
# olig376
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig376.hic
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/HiC-Pro -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig376.hic/hicpro/hic_results/data -o /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig376.hic/hicpro -c /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/config-hicpro.txt -p -s build_contact_maps -s ice_norm
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig376.hic/hicpro/
sbatch HiCPro_step2_hicpro.sh
ssh or-slurm-login.ornl.gov
ssh brut-login01.ornl.gov
cd /lustre/or-scratch/cades-bsd/27n
# conda create --name fithic -c bioconda fithic
# conda install -c conda-forge numpy scipy scikit-learn sortedcontainers matplotlib
conda activate fithic
# cd /lustre/or-scratch/cades-bsd/27n
# git clone https://github.com/ay-lab/fithic.git
# cd /lustre/or-scratch/cades-bsd/27n/fithic
# ./fithic/tests/run_tests-git.sh
conda activate fithic
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/
python fithic.py --ARGUMENTS
# #### not working from my bioconda environment...
# # Try 2 things...
# ## 1: use bioconductor-fithic bioconda
# conda activate fithic
# conda install -c bioconda bioconductor-fithic
#
# ## 2: use Kyle's on Andes... transfer data from BRUT to OLCF with globus
# ## Source Collection: CADES-OR --> Path: /lustre/or-scratch/cades-bsd/27n/
# ### transfer directories: gaba372.hic, gaba376.hic, glu372.hic, glu376.hic, olig372.hic, olig376.hic
# ## End Collection: OLCF DTN --> Path: /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/
# ## Task ID: b532dbbe-ddc0-11ec-b5c4-79437d2253ac
#
# /gpfs/alpine/syb105/proj-shared/Personal/sullivanka/libraries/dtn/anaconda3/envs/fithic
# convert files to proper fithic input with hicpro2fithic.py: https://github.com/nservant/HiC-Pro/blob/master/bin/utils/hicpro2fithic.py
# see also: http://nservant.github.io/HiC-Pro/UTILS.html#hicpro2fithic-py and https://bioconductor.org/packages/release/bioc/vignettes/FitHiC/inst/doc/fithic.html
conda activate /lustre/or-scratch/cades-bsd/27n/envs/HiCPro
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/utils/hicpro2fithic.py -i hic_results/matrix/GABA372/raw/20000/GABA372_20000.matrix -b hic_results/matrix/GABA372/raw/20000/GABA372_20000_abs.bed -s hic_results/matrix/GABA372/iced/20000/GABA372_20000_iced.matrix.biases
# conda activate fithic
# conda install -c conda-forge python=3.8
# conda install -c conda-forge r-base=4.1
# mkdir fithic
# python /lustre/or-scratch/cades-bsd/27n/fithic/fithic/fithic.py -f fithic.fragmentMappability.gz -i fithic.interactionCounts.gz -o /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic -r 20000 -t fithic.biases.gz
conda activate fithic
# R
library(FitHiC)
# load interactions, fragments, and bias files
intersfile <- "/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.interactionCounts.gz"
fragsfile <- "/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.fragmentMappability.gz"
bias <- "/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.biases.gz"
outdir <- "/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/"
# run FitHiC with and without bias normalization
FitHiC(fragsfile, intersfile, outdir, libname="GABA372_NoBiasFile", distUpThres=2000000, distLowThres=20000, visual = TRUE)
FitHiC(fragsfile, intersfile, outdir, biasfile = bias, libname="GABA372_BiasCorrection", distUpThres=2000000, distLowThres=20000, visual = TRUE)
### zcat < GABA372_BiasCorrection.spline_pass2.significances.txt.gz | head
# chr1 fragmentMid1 chr2 fragmentMid2 contactCount p_value q_value
# NC_000001.11 10000 NC_000001.11 70000 1 1 1
# NC_000001.11 50000 NC_000001.11 90000 1 1 1
# NC_000001.11 50000 NC_000001.11 110000 1 1 1
# NC_000001.11 50000 NC_000001.11 130000 2 1 1
# NC_000001.11 70000 NC_000001.11 130000 1 1 1
# NC_000001.11 90000 NC_000001.11 130000 1 1 1
# NC_000001.11 10000 NC_000001.11 190000 4 1 1
# NC_000001.11 90000 NC_000001.11 270000 2 1 1
# NC_000001.11 110000 NC_000001.11 270000 2 1 1
#### returning pval and qval = 1 ????
#### try with HiC-Pro output files
intersfile <- "/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/matrix/GABA372/raw/20000/GABA372_20000.cp.matrix.gz"
fragsfile <- "/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/matrix/GABA372/raw/20000/GABA372_20000_abs.cp.bed.gz"
biasfile <- "/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/matrix/GABA372/iced/20000/GABA372_20000_iced.cp.matrix.biases.gz"
outdir <- "/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/"
FitHiC(fragsfile, intersfile, outdir, biasfile, libname="GABA372_useHiCPro", distUpThres=2000000, distLowThres=20000, visual=TRUE, useHiCPro=TRUE)
### still not getting bias columns... try with python version instead of R version
######## on BRUT command line ########
python3 /lustre/or-scratch/cades-bsd/27n/fithic/fithic/fithic.py -i /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.interactionCounts.gz -f /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.fragmentMappability.gz -t /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.biases.gz -o /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/ -r 20000 --lib GABA372_fithic_python --passes 2 --upperbound 2000000 --lowerbound 20000 --visual
# GIVEN FIT-HI-C ARGUMENTS
# =========================
# Reading fragments file from: /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.fragmentMappability.gz
# Reading interactions file from: /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.interactionCounts.gz
# Output path being used from /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/
# Fixed size option detected... Fast version of FitHiC will be used
# Resolution is 20.0 kb
# Reading bias file from: /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.biases
# The number of spline passes is 2
# The number of bins is 100
# The number of reads required to consider an interaction is 1
# The name of the library for outputted files will be GABA372_fithic_python
# Upper Distance threshold is 2000000
# Lower Distance threshold is 20000
# Graphs will be outputted
# Only intra-chromosomal regions will be analyzed
# Lower bound of bias values is 0.5
# Upper bound of bias values is 2
# All arguments processed. Running FitHiC now...
# =========================
# Reading the contact counts file to generate bins...
# Interactions file read. Time took 196.90621900558472
# Fragments file read. Time took 0.4247448444366455
# Traceback (most recent call last):
# File "/lustre/or-scratch/cades-bsd/27n/fithic/fithic/fithic.py", line 1324, in <module>
# main()
# File "/lustre/or-scratch/cades-bsd/27n/fithic/fithic/fithic.py", line 327, in main
# biasDic = read_biases(biasFile)
# File "/lustre/or-scratch/cades-bsd/27n/fithic/fithic/fithic.py", line 808, in read_biases
# chrom=words[0]; midPoint=int(words[1]); bias=float(words[2])
# IndexError: list index out of range
# --> issue with bias file??? but it is being read in just fine... add print(words) after words is defined above error line
# ['NC_000001.11', '10000', '0.08434859430837358']
# []
#### why is there an empty line after every line in the bias file??? fix the file with: grep -v '^$' fithic.biases > fithic.biases.corrected OR add to the python code: if line == '\n': continue
### To get proper output need to run with python script... R script does not provide bias1, bias2, ExpCC columns
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro
python3 /lustre/or-scratch/cades-bsd/27n/fithic/fithic/fithic.py -i /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.interactionCounts.gz -f /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.fragmentMappability.gz -r 20000 -o /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/ -t /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.biases.corrected.gz --lib GABA372_fithic_python --passes 2 --upperbound 2000000 --lowerbound 20000 --visual
# Spline fit Pass 1 starting...
# Plotting /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/GABA372_fithic_python.spline_pass1.png
# Outlier threshold is... 3.514419663880903e-08
# Writing p-values and q-values to file /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/GABA372_fithic_python.spline_pass1.significances.txt
# Plotting q-values to file /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/GABA372_fithic_python.spline_pass1.qplot.png
# Number of outliers is... 6807
# Spline fit Pass 1 completed. Time took 772.8441739082336
## run without bias file
python3 /lustre/or-scratch/cades-bsd/27n/fithic/fithic/fithic.py -i /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.interactionCounts.gz -f /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.fragmentMappability.gz -o /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/ -r 20000 --lib GABA372_fithic_nobias_python --passes 2 --upperbound 2000000 --lowerbound 20000 --visual
# GIVEN FIT-HI-C ARGUMENTS
# =========================
# Reading fragments file from: /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.fragmentMappability.gz
# Reading interactions file from: /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.interactionCounts.gz
# Output path being used from /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/
# Fixed size option detected... Fast version of FitHiC will be used
# Resolution is 20.0 kb
# No bias file
# The number of spline passes is 2
# The number of bins is 100
# The number of reads required to consider an interaction is 1
# The name of the library for outputted files will be GABA372_fithic_nobias_python
# Upper Distance threshold is 2000000
# Lower Distance threshold is 20000
# Graphs will be outputted
# Only intra-chromosomal regions will be analyzed
# Lower bound of bias values is 0.5
# Upper bound of bias values is 2
# All arguments processed. Running FitHiC now...
# =========================
# Reading the contact counts file to generate bins...
# Interactions file read. Time took 192.01878547668457
# Fragments file read. Time took 0.4195888042449951
# Writing /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/GABA372_fithic_nobias_python.fithic_pass1.res20000.txt
# Spline fit Pass 1 starting...
# Plotting /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/GABA372_fithic_nobias_python.spline_pass1.png
# Outlier threshold is... 3.514419663880903e-08
# Writing p-values and q-values to file /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/GABA372_fithic_nobias_python.spline_pass1.significances.txt
# Plotting q-values to file /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/GABA372_fithic_nobias_python.spline_pass1.qplot.png
# Number of outliers is... 24262
# Spline fit Pass 1 completed. Time took 761.3818807601929
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --time=48-0:0:0
#SBATCH --mem-per-cpu=42G
#SBATCH -p brut_batch
#SBATCH --mail-user=noshayjm@ornl.gov
#SBATCH --mail-type=end
#SBATCH --job-name=fithic
#SBATCH --export=ALL
# generate fithic input files
conda activate /lustre/or-scratch/cades-bsd/27n/envs/HiCPro
#cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro
#/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/utils/hicpro2fithic.py -i hic_results/matrix/GABA372/raw/20000/GABA372_20000.matrix -b hic_results/matrix/GABA372/raw/20000/GABA372_20000_abs.bed -s hic_results/matrix/GABA372/iced/20000/GABA372_20000_iced.matrix.biases
# Convert the 20 kb HiC-Pro matrices of each remaining sample into FitHiC input files.
# gaba372 is not in the list; its conversion is the commented-out pair of lines above.
for sample in gaba376 glu372 glu376 olig372 olig376; do
    # the HiC-Pro matrix directories and file names use the upper-case sample name
    lib=$(echo "${sample}" | tr '[:lower:]' '[:upper:]')
    matrix_dir="hic_results/matrix/${lib}"
    # the matrix paths below are relative to the sample's hicpro directory
    cd "/lustre/or-scratch/cades-bsd/27n/${sample}.hic/hicpro"
    /lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/utils/hicpro2fithic.py \
        -i "${matrix_dir}/raw/20000/${lib}_20000.matrix" \
        -b "${matrix_dir}/raw/20000/${lib}_20000_abs.bed" \
        -s "${matrix_dir}/iced/20000/${lib}_20000_iced.matrix.biases"
done
# run fithic
source activate fithic
cd /lustre/or-scratch/cades-bsd/27n/fithic
# need to correct fithic.biases files with following code for each sample
# gunzip /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.biases.gz
# sed '/^$/d' /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.biases > /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.biases.corrected
# gzip /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.biases.corrected
# gunzip /lustre/or-scratch/cades-bsd/27n/gaba376.hic/hicpro/fithic.biases.gz
# sed '/^$/d' /lustre/or-scratch/cades-bsd/27n/gaba376.hic/hicpro/fithic.biases > /lustre/or-scratch/cades-bsd/27n/gaba376.hic/hicpro/fithic.biases.corrected
# gzip /lustre/or-scratch/cades-bsd/27n/gaba376.hic/hicpro/fithic.biases.corrected
# Remove the empty lines from each sample's fithic.biases file and recompress the result as
# fithic.biases.corrected.gz (fithic.py stops with an IndexError in read_biases on empty lines).
# -f makes gunzip/gzip overwrite files left behind by an earlier run; without it they refuse to
# overwrite in a non-interactive batch job and the stale files would be picked up instead.
for sample in glu372 glu376 olig372 olig376; do
    biases="/lustre/or-scratch/cades-bsd/27n/${sample}.hic/hicpro/fithic.biases"
    gunzip -f "${biases}.gz"
    sed '/^$/d' "${biases}" > "${biases}.corrected"
    gzip -f "${biases}.corrected"
done
#mkdir /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/
#python3 /lustre/or-scratch/cades-bsd/27n/fithic/fithic/fithic.py -i /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.interactionCounts.gz -f /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.fragmentMappability.gz -r 20000 -o /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic/ -t /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic.biases.corrected.gz --lib GABA372_fithic_python --passes 2 --upperbound 2000000 --lowerbound 20000 --visual
# Run the python version of FitHiC at 20 kb resolution for each remaining sample, using the
# corrected bias file. gaba372 is not in the list; its run is the commented-out line above.
fithic_py=/lustre/or-scratch/cades-bsd/27n/fithic/fithic/fithic.py
for sample in gaba376 glu372 glu376 olig372 olig376; do
    hicpro_dir="/lustre/or-scratch/cades-bsd/27n/${sample}.hic/hicpro"
    # library names use the upper-case sample name (e.g. GABA376_fithic_python)
    lib=$(echo "${sample}" | tr '[:lower:]' '[:upper:]')
    # -p: do not fail when the output directory already exists (e.g. on a rerun)
    mkdir -p "${hicpro_dir}/fithic/"
    python3 "${fithic_py}" \
        -i "${hicpro_dir}/fithic.interactionCounts.gz" \
        -f "${hicpro_dir}/fithic.fragmentMappability.gz" \
        -r 20000 \
        -o "${hicpro_dir}/fithic/" \
        -t "${hicpro_dir}/fithic.biases.corrected.gz" \
        --lib "${lib}_fithic_python" \
        --passes 2 --upperbound 2000000 --lowerbound 20000 --visual
done
# sbatch /lustre/or-scratch/cades-bsd/27n/fithic.sh
# The first file [GABA372_BiasCorrection.fithic_pass2.txt] will report the results of equal occupancy binning in five fields (avgGenomicDistance, contactProb, stdErr, numLocusPairs, CCtotal)
# The second file will have the exact same lines as in the input file that contains the list of mid-range contacts. This input file had 5 fields as described above. The output from each step will append... p-value, q-value, bias1, bias2, ExpCC
## p-value: p-value of the corresponding interaction, as computed by the binomial distribution model employed in FitHiC.
## q-value: q-value or FDR obtained by applying Benjamini-Hochberg correction to the p-values.
## bias1: Bias value of the first interacting fragment.
## bias2: Bias value of the second interacting fragment.
## ExpCC: Expected contact count of the current interaction, computed using the raw contact count, spline fit probability of the raw contact count (with respect to the loop distance), and the given bias values. Enrichment of the raw (observed) contact count with respect to the expected contact count is reflected in the q-value.
# Hyejung post-processing
ssh or-slurm-login.ornl.gov
ssh brut-login01.ornl.gov
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/fithic
#pip install imgcat
imgcat local_image.png
# ===In Linux===
# zcat GABA372_NoBiasFile.spline_pass2.significances.txt.gz | awk '{$10 = ($4-$2); print}' > GABA372_NoBiasFile.distance.txt # all interactions: total line= 8724585
# awk '{if($10<=1000000 && $1==$3) print}' GABA372_NoBiasFile.distance.txt > GABA372_NoBiasFile.1Mbrange.txt # select only for intrachromosomal interactions within 1Mb range: total line=8724585
# awk '{if($6<0.01) print}' GABA372_NoBiasFile.1Mbrange.txt > GABA372_NoBiasFile.1Mbrange.P0.01.txt
#
# zcat GABA372_BiasCorrection.spline_pass2.significances.txt.gz | awk '{$10 = ($4-$2); print}' > GABA372_BiasCorrection.distance.txt # all interactions: total line= 8724585
# awk '{if($10<=1000000 && $1==$3) print}' GABA372_BiasCorrection.distance.txt > GABA372_BiasCorrection.1Mbrange.txt # select only for intrachromosomal interactions within 1Mb range: total line=8724584
# awk '{if($6<0.01) print}' GABA372_BiasCorrection.1Mbrange.txt > GABA372_BiasCorrection.1Mbrange.P0.01.txt
#
# zcat GABA372_useHiCPro.spline_pass2.significances.txt.gz | awk '{$10 = ($4-$2); print}' > GABA372_useHiCPro.distance.txt # all interactions: total line= 8724585
# awk '{if($10<=1000000 && $1==$3) print}' GABA372_useHiCPro.distance.txt > GABA372_useHiCPro.1Mbrange.txt # select only for intrachromosomal interactions within 1Mb range: total line=8724584
# awk '{if($6<0.01) print}' GABA372_useHiCPro.1Mbrange.txt > GABA372_useHiCPro.1Mbrange.P0.01.txt
zcat GABA372_fithic_nobias_python.spline_pass2.res20000.significances.txt.gz | awk '{$10 = ($4-$2); print}' > GABA372_fithic_nobias_python.distance.txt # all interactions: total line= 8861805
awk '{if($10<=1000000 && $1==$3) print}' GABA372_fithic_nobias_python.distance.txt > GABA372_fithic_nobias_python.1Mbrange.txt # select only for intrachromosomal interactions within 1Mb range: total line=5575383
awk '{if($6<0.01) print}' GABA372_fithic_nobias_python.1Mbrange.txt > GABA372_fithic_nobias_python.1Mbrange.P0.01.txt # 409953
zcat GABA372_fithic_python.spline_pass2.res20000.significances.txt.gz | awk '{$10 = ($4-$2); print}' > GABA372_fithic_python.distance.txt # all interactions: total line= 8861805
awk '{if($10<=1000000 && $1==$3) print}' GABA372_fithic_python.distance.txt > GABA372_fithic_python.1Mbrange.txt # select only for intrachromosomal interactions within 1Mb range: total line=5575383
awk '{if($6<0.01) print}' GABA372_fithic_python.1Mbrange.txt > GABA372_fithic_python.1Mbrange.P0.01.txt # 262665
# ===In R===
conda activate fithic
# Load the interactions kept by the awk filtering step (<= 1 Mb, same chromosome, p-value < 0.01)
bedfile = read.table("GABA372_fithic_nobias_python.1Mbrange.P0.01.txt")
# Name the 10 columns; "dist" is the value the awk step writes into field 10 (fragmentMid2 - fragmentMid1)
# NOTE(review): the notes describe the FitHiC python output as ending in bias1, bias2, ExpCC, so
# writing the distance into field 10 would replace ExpCC -- confirm that this is intended
colnames(bedfile) = c("chr1","int1","chr2","int2","freq","P","Q","bias1","bias2","dist")
# Sort by ascending p-value and record each row's rank
bedfile = bedfile[order(bedfile$P), ]
bedfile$order = 1:nrow(bedfile)
# Rank-scaled p-value: P * (number of tests) / rank; the number of tests is hard-coded and must be
# updated whenever the 1Mbrange.txt file changes
bedfile$fdr = bedfile$P * 5575383 / bedfile$order # line number of 1Mbrange.txt
# Keep only the interactions whose rank-scaled p-value is below 0.01
bedfile = bedfile[bedfile$fdr<0.01, ]
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J fithic
#SBATCH -N 1
#SBATCH -p batch
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro
#/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/hicpro2
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output
# generate fithic input files
# Convert the 5 kb HiC-Pro matrices of every sample into FitHiC input files.
for sample in gaba372 gaba376 glu372 glu376 olig372 olig376; do
    # the HiC-Pro matrix directories and file names use the upper-case sample name
    lib=$(echo "${sample}" | tr '[:lower:]' '[:upper:]')
    matrix_dir="hic_results/matrix/${lib}"
    # return to the project root first, then enter the sample's hicpro directory;
    # the matrix paths below are relative to that directory
    cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output
    cd "${sample}.hic/hicpro"
    /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro/bin/utils/hicpro2fithic.py \
        -i "${matrix_dir}/raw/5000/${lib}_5000.matrix" \
        -b "${matrix_dir}/raw/5000/${lib}_5000_abs.bed" \
        -s "${matrix_dir}/iced/5000/${lib}_5000_iced.matrix.biases"
done
# run fithic
# conda create --name fithic -c bioconda fithic
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/fithic
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/fithic
#git clone https://github.com/ay-lab/fithic.git
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output
# need to correct fithic.biases files with following code for each sample
# Remove the empty lines from each sample's fithic.biases file and recompress the result as
# fithic.biases.corrected.gz. Paths are relative to the current directory.
# -f makes gunzip/gzip overwrite files left behind by an earlier run; without it they refuse to
# overwrite in a non-interactive batch job and the stale files would be picked up instead.
for sample in gaba372 gaba376 glu372 glu376 olig372 olig376; do
    biases="${sample}.hic/hicpro/fithic.biases"
    gunzip -f "${biases}.gz"
    sed '/^$/d' "${biases}" > "${biases}.corrected"
    gzip -f "${biases}.corrected"
done
# Run the python version of FitHiC at 5 kb resolution for every sample, using the corrected
# bias file and writing into <sample>.hic/hicpro/fithic/5kb/. Paths are relative to the
# current directory.
fithic_py=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/fithic/fithic/fithic/fithic.py
for sample in gaba372 gaba376 glu372 glu376 olig372 olig376; do
    hicpro_dir="${sample}.hic/hicpro"
    # library names use the upper-case sample name (e.g. GABA372_fithic_python)
    lib=$(echo "${sample}" | tr '[:lower:]' '[:upper:]')
    # -p creates fithic/ and fithic/5kb in one call and does not fail when they already exist
    mkdir -p "${hicpro_dir}/fithic/5kb"
    python3 "${fithic_py}" \
        -i "${hicpro_dir}/fithic.interactionCounts.gz" \
        -f "${hicpro_dir}/fithic.fragmentMappability.gz" \
        -r 5000 \
        -o "${hicpro_dir}/fithic/5kb/" \
        -t "${hicpro_dir}/fithic.biases.corrected.gz" \
        --lib "${lib}_fithic_python" \
        --passes 2 --upperbound 2000000 --lowerbound 5000 --visual
done
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/fithic.sh
# Post-process the 5 kb FitHiC pass-2 results of every sample in three steps:
#   1. *.distance.txt        : all interactions, with field 10 set to fragmentMid2 - fragmentMid1
#   2. *.1Mbrange.txt        : only intrachromosomal interactions within 1 Mb range
#   3. *.1Mbrange.P0.01.txt  : only interactions with p-value (field 6) < 0.01
# Line counts recorded for each output (distance / 1Mbrange / 1Mbrange.P0.01):
#   GABA372: 25,873,891 / 20,522,643 / 572268
#   GABA376: 27040889 / 21487105 / 857037
#   GLU372:  92073981 / 65528088 / 2867958
#   GLU376:  36662088 / 28990712 / 1090289
#   OLIG372: 81038743 / 56578627 / 2122062
#   OLIG376: 74918078 / 52951134 / 1900737
for sample in gaba372 gaba376 glu372 glu376 olig372 olig376; do
    # file names use the upper-case sample name
    lib=$(echo "${sample}" | tr '[:lower:]' '[:upper:]')
    prefix="${lib}_fithic_python.res5000"
    cd "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/${sample}.hic/hicpro/fithic/5kb"
    zcat "${lib}_fithic_python.spline_pass2.res5000.significances.txt.gz" | awk '{$10 = ($4-$2); print}' > "${prefix}.distance.txt"
    awk '{if($10<=1000000 && $1==$3) print}' "${prefix}.distance.txt" > "${prefix}.1Mbrange.txt"
    awk '{if($6<0.01) print}' "${prefix}.1Mbrange.txt" > "${prefix}.1Mbrange.P0.01.txt"
done
# scp 27n@brut-login01.ornl.gov:/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/matrix/GABA372/iced/20000/GABA372_20000_iced.matrix "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
# scp 27n@brut-login01.ornl.gov:/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/matrix/GABA372/raw/20000/GABA372_20000_abs.bed "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
# python HiCPlotter.py -f GABA372_20000_iced.matrix -o GABA372_20000_iced.out -r 20000 -tri 1 -bed GABA372_20000_abs.bed -n hES -wg 1 -chr chrX
#
# # if (!require("BiocManager", quietly = TRUE))
# # install.packages("BiocManager")
# #
# # BiocManager::install("HiTC")
# require(HiTC)
# ## Load Hi-C data
# x<-importC("GABA372_20000_iced.matrix", xgi.bed="GABA372_20000_abs.bed")
# show(x)
# ## Plot X intra-chromosomal map
# mapC(HTClist(x$NC_000001.11NC_000001.11), trim.range=.9)
#
# # create juicebox file
# # https://github.com/nservant/HiC-Pro/blob/master/bin/utils/hicpro2juicebox.sh
# # https://github.com/aidenlab/juicer/wiki/Installation
# module load anaconda3
# #conda create -p /lustre/or-scratch/cades-bsd/27n/envs/java python=3.8
# conda activate /lustre/or-scratch/cades-bsd/27n/envs/java
# #conda update -n base conda
# #conda install -c bioconda/label/cf201901 java-jdk
#################### run hicpro2juicebox.sh script to generate .hic file #########################
###### BRUT ######
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --ntasks-per-socket=2
#SBATCH --time=02:0:0
#SBATCH --mem-per-cpu=42G
#SBATCH -p brut_batch
#SBATCH --job-name=juicebox
#SBATCH --export=ALL
# Convert the GABA372 HiC-Pro valid pairs to a Juicebox .hic file on BRUT.
conda activate /lustre/or-scratch/cades-bsd/27n/envs/java
scratch=/lustre/or-scratch/cades-bsd/27n
hicpro="${scratch}/HiC-Pro_3.1.0"
results="${scratch}/gaba372.hic/hicpro/hic_results"
# Run from the results directory: the temp (-t) and output (-o) directories are given as relative paths.
cd "${results}"
#mkdir gaba372.juicer
#mkdir gaba372.juicer.temp
"${hicpro}/bin/utils/hicpro2juicebox.sh" \
    -i "${results}/data/GABA372/GABA372.allValidPairs" \
    -g "${hicpro}/annotation/GRCh38.p13.chr.size" \
    -j "${scratch}/juicer/juicer_tools.jar" \
    -r "${hicpro}/annotation/hg38.arima.restriction.bed" \
    -t gaba372.juicer.temp \
    -o gaba372.juicer
# sbatch /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/juicebox.file.sh
# scp 27n@brut-login01.ornl.gov:/lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/gaba372.juicer/GABA372.allValidPairs.hic .
#################### run hiccups with juicer to call loops #########################
###### BRUT ######
# Activate the Java environment and move to the directory holding GABA372.allValidPairs.hic.
source activate /lustre/or-scratch/cades-bsd/27n/envs/java
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/hic_results/gaba372.juicer
# First HiCCUPS attempt, requesting resolution 20000 and writing results to the current directory.
# Per the log below, that resolution is not available in the .hic file, so the tool reports
# falling back to default settings and then exits on the sparsity check.
java -jar /lustre/or-scratch/cades-bsd/27n/juicer/juicer_tools.1.9.9_jcuda.0.8.jar hiccups --threads 28 -r 20000 GABA372.allValidPairs.hic ./
# Reading file: GABA372.allValidPairs.hic
# Resolution 20000 not available.
# No valid configurations specified, using default settings
# Warning Hi-C map is too sparse to find many loops via HiCCUPS.
# Exiting. To disable sparsity check, use the --ignore_sparsity flag.
# Second attempt: identical call plus --ignore_sparsity, as suggested by the exit message above.
java -jar /lustre/or-scratch/cades-bsd/27n/juicer/juicer_tools.1.9.9_jcuda.0.8.jar hiccups --threads 28 -r 20000 --ignore_sparsity GABA372.allValidPairs.hic ./
# GPU/CUDA Installation Not Detected <-- Try on Andes instead
###### Andes ######
# scp GABA372.allValidPairs.hic noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/.
# salloc -A SYB105 -N 2 -t 4:00:00 -p gpu
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J hiccups
#SBATCH -N 2
#SBATCH -t 10:00:00
#SBATCH -p gpu
# HiCCUPS on the Andes gpu partition: load conda, activate the juicer env, then load gcc 6.5.0 and cuda.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate juicer
module load gcc/6.5.0 cuda
# Results are written to the current directory (./), so run from the juicer project directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/
# NOTE(review): the BRUT log for this same .hic file reported "Resolution 20000 not available";
# presumably -r 20000 is ignored here as well and default resolutions are used - confirm.
java -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/juicer/scripts/juicer_tools.1.9.9_jcuda.0.8.jar hiccups --threads 28 -r 20000 --ignore_sparsity /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/GABA372.allValidPairs.hic ./
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/hiccups.sh
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/fdr_thresholds_5000 .
# scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/enriched_pixels_5000.bedpe .
## still issues with GPU/CUDA
# Report the CUDA compiler version on PATH. The output of the original run is kept
# below as comments so it is not executed if this section is re-run.
nvcc --version
# nvcc: NVIDIA (R) Cuda compiler driver
# Copyright (c) 2005-2019 NVIDIA Corporation
# Built on Wed_Oct_23_19:24:38_PDT_2019
# Cuda compilation tools, release 10.2, V10.2.89
###### detect_gpu.cu
# #include <stdio.h>
# int main() {
# int nDevices;
#
# cudaGetDeviceCount(&nDevices);
# printf("Number of devices found: %d\n", nDevices);
# for (int i = 0; i < nDevices; i++) {
# cudaDeviceProp prop;
# cudaGetDeviceProperties(&prop, i);
# printf("Device Number: %d\n", i);
# printf(" Device name: %s\n", prop.name);
# printf(" Memory Clock Rate (KHz): %d\n",
# prop.memoryClockRate);
# printf(" Memory Bus Width (bits): %d\n",
# prop.memoryBusWidth);
# printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
# 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
# }
# }
# Compile detect_gpu.cu (source listed in the comments above) and run it; its output is recorded below.
nvcc -o detect_gpu detect_gpu.cu
./detect_gpu
# Number of devices found: 4
# Device Number: 0
# Device name: Tesla K80
# Memory Clock Rate (KHz): 2505000
# Memory Bus Width (bits): 384
# Peak Memory Bandwidth (GB/s): 240.480000
#
# Device Number: 1
# Device name: Tesla K80
# Memory Clock Rate (KHz): 2505000
# Memory Bus Width (bits): 384
# Peak Memory Bandwidth (GB/s): 240.480000
#
# Device Number: 2
# Device name: Tesla K80
# Memory Clock Rate (KHz): 2505000
# Memory Bus Width (bits): 384
# Peak Memory Bandwidth (GB/s): 240.480000
#
# Device Number: 3
# Device name: Tesla K80
# Memory Clock Rate (KHz): 2505000
# Memory Bus Width (bits): 384
# Peak Memory Bandwidth (GB/s): 240.480000
### try on Summit
# Summit attempt: load conda, activate the summit-test env, and load the cuda and gcc modules.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/envs/summit-test
#conda install -c conda-forge grpc_java_plugin
module load cuda
module load gcc
#cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit
#git clone https://github.com/theaidenlab/juicer.git
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/
# usageHelp="Usage: ${0} [-h] -j <juicer_tools_file_path> -i <hic_file_path> -m <bed_file_dir> -g <genome ID>"
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/juicer/scripts/common
# Invoke the script by explicit relative path: the current directory is normally not on PATH,
# so a bare "juicer_hiccups.sh" would fail with "command not found".
# NOTE(review): the -j jar is given relative to scripts/common - confirm the jar lives there;
# -m and -g from the usage line are not passed, so the script's defaults (if any) apply.
./juicer_hiccups.sh -j juicer_tools.1.9.9_jcuda.0.8.jar -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/juicer/GABA372.allValidPairs.hic
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J juicebox
#SBATCH -N 1
#SBATCH -p batch
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
# Run hicpro2juicebox.sh for each of the six samples, each from its own hic_results directory.
personal=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm
source "${personal}/scripts/loadcondaandes.sh"
conda activate "${personal}/Libraries/andes/anaconda3/envs/juicer"
hicpro_home="${personal}/Libraries/andes/anaconda3/envs/HiCPro/HiC-Pro"
juicer_jar="${personal}/Libraries/andes/juicer/scripts/juicer_tools.1.9.9_jcuda.0.8.jar"
for sample in gaba372 gaba376 glu372 glu376 olig372 olig376; do
    # Directory names use the lower-case sample id; data files use the upper-case one.
    sample_uc=${sample^^}
    cd "${personal}/projects/opioid.atac/brut.hicpro.output"
    cd "${sample}.hic/hicpro/hic_results"
    "${hicpro_home}/bin/utils/hicpro2juicebox.sh" \
        -i "data/${sample_uc}/${sample_uc}.allValidPairs" \
        -g "${hicpro_home}/annotation//GRCh38.p13.chr.size" \
        -j "${juicer_jar}" \
        -r "${hicpro_home}/annotation//hg38.arima.restriction.bed" \
        -t "${sample}.juicer.temp" \
        -o "${sample}.juicer"
done
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/juicebox.hic.sh
# --> alternatively, use juicer_tools directly to convert to a .hic file
# Build gaba372.hic with the juicer_tools "pre" command from the GABA372 valid-pairs file.
# NOTE(review): the HiC-Pro allValidPairs file is passed unchanged here - confirm "pre" accepts this layout.
andes_lib=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results
java -Xmx1G -jar "${andes_lib}/juicer/scripts/juicer_tools.1.9.9_jcuda.0.8.jar" pre \
    data/GABA372/GABA372.allValidPairs \
    gaba372.hic \
    "${andes_lib}/anaconda3/envs/HiCPro/HiC-Pro/annotation/GRCh38.p13.chr.size"
# Install Mustache on Andes: clone the repository into the library directory, then
# create its conda environment from the bundled environment.yml and activate it.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
git clone https://github.com/ay-lab/mustache
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda env create -f ./mustache/environment.yml
conda activate mustache
# python3 mustache/mustache/mustache.py -f ./mustache/data/chr21_5kb.RAWobserved -b ./mustache/data/chr21_5kb.KRnorm -ch 21 -r 5kb -o chr21_out5.tsv -pt 0.1 -st 0.8
# default chr = all, p-val threshold = 0.1, sparsity threshold = 0.88
## run with .hic file
# Work in the directory that holds GABA372.allValidPairs.hic.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/gaba372.juicer
# f=input file, ch=chromosome, r=resolution, o=output file
# This attempt runs from the juicer env, first updated from environment.juicer.yml in the mustache checkout.
conda activate juicer
conda env update --file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/mustache/environment.juicer.yml
# Attempt 1: 1kb resolution, -pt 0.01, no chromosome restriction (failed with the error below).
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/mustache/mustache/mustache.py -f GABA372.allValidPairs.hic -r 1kb -pt 0.01 -o GABA372.allValidPairs.hic.mustache.tsv
# ModuleNotFoundError: No module named 'hicstraw'
## pip install cooler; pip install hic-straw
# run mustache with .hic file
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/gaba372.juicer
# Attempt 2: 20kb resolution restricted to NC_000001.11; the log that follows ends with "Killed".
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/mustache/mustache/mustache.py -f GABA372.allValidPairs.hic -r 20kb -pt 0.01 -ch NC_000001.11 -o GABA372.hic.mustache.tsv
# The distance limit is set to 4000000bp
# Reading contact map...
# 0 40000000
# File did not contain KR normalization vectors for one or both chromosomes at 20000 BP
# Error finding block data
# 36000000 76000000
# File did not contain KR normalization vectors for one or both chromosomes at 20000 BP
# Killed
# run with same input as fithic (**need to transfer proper files to andes...)
# Switch to the mustache environment and install pinned hic-straw and cooler versions into it.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate mustache
pip install hic-straw==1.3.0
pip install cooler==0.8.7
# Run Mustache one chromosome at a time on the FitHiC input files (-f counts, -b biases)
# at 20kb with -pt 0.1. The loop count reported by each run is recorded after its command.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/mustache/mustache/mustache.py -f fithic.interactionCounts -b fithic.biases.corrected -r 20kb -pt 0.1 -ch NC_000001.11 -o GABA372.hic.mustache.chr1.tsv
# 376 loops found for chrmosome=NC_000001.11, fdr<0.1 in 57.29sec
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/mustache/mustache/mustache.py -f fithic.interactionCounts -b fithic.biases.corrected -r 20kb -pt 0.1 -ch NC_000008.11 -o GABA372.hic.mustache.chr8.tsv
# 253 loops found for chrmosome=NC_000008.11, fdr<0.1 in 47.90sec
# NC_000008.11 BestRefSeq gene 142611049 142614479 . - . gene_id "ARC"; transcript_id ""; db_xref "GeneID:23237"; db_xref "HGNC:HGNC:648"; db_xref "MIM:612461"; description "activity regulated cytoskeleton associated protein"; gbkey "Gene"; gene "ARC"; gene_biotype "protein_coding"; gene_synonym "Arg3.1"; gene_synonym "hArc";
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/mustache/mustache/mustache.py -f fithic.interactionCounts -b fithic.biases.corrected -r 20kb -pt 0.1 -ch NC_000011.10 -o GABA372.hic.mustache.chr11.tsv
# 183 loops found for chrmosome=NC_000011.10, fdr<0.1 in 48.39sec
# NC_000011.10 BestRefSeq%2CGnomon gene 66,408,922 66426707 . + . gene_id "NPAS4"; transcript_id ""; db_xref "GeneID:266743"; db_xref "HGNC:HGNC:18983"; db_xref "MIM:608554"; description "neuronal PAS domain protein 4"; gbkey "Gene"; gene "NPAS4"; gene_biotype "protein_coding"; gene_synonym "bHLHe79"; gene_synonym "Le-PAS"; gene_synonym "NXF"; gene_synonym "PASD10";
# ARC
# NC_000008.11 BestRefSeq gene 142611049 142614479 . - . gene_id "ARC"; transcript_id ""; db_xref "GeneID:23237"; db_xref "HGNC:HGNC:648"; db_xref "MIM:612461"; description "activity regulated cytoskeleton associated protein"; gbkey "Gene"; gene "ARC"; gene_biotype "protein_coding"; gene_synonym "Arg3.1"; gene_synonym "hArc";
# NC_000008.11:142611049-NC_000008.11:142614479
# ARC start coordinate (with thousands separators): 142,611,049
# Load conda on Andes, activate the "test" environment and start an interactive R session;
# the lines that follow are R code typed into that session.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
R
# Inspect the filtered GABA372 FitHiC interactions around ARC and export a
# viewable table of bin pairs (chr, start, end, chr, start, end).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/fithic")
# Space-separated file read without a header row, so columns are auto-named V1, V2, ...
df <- read.delim("GABA372_fithic_python.1Mbrange.P0.01.txt", header = FALSE, sep = " ")
# Window = ARC start (142611049) +/- 20000 on NC_000008.11; fragment1 matches on V2, fragment2 on V4.
arc.fragment1 <- subset(df, V1 == "NC_000008.11" & V2 >= 142591049 & V2 <= 142631049)
arc.fragment2 <- subset(df, V1 == "NC_000008.11" & V4 >= 142591049 & V4 <= 142631049)
# ARC is on the - strand so fragment1 means that the interaction is with ARC gene and upstream, fragment2 is ARC gene and downstream
# make a view-able fithic file... chr, start, end, chr, start, end
library(dplyr)
# NOTE(review): bins are written as V2/V4 +/- 20000 (40 kb wide) - confirm this matches the
# FitHiC bin size used for this file.
df.view <- df %>%
  mutate(BIN1_CHR = V1, BIN1_START = V2 - 20000, BIN1_END = V2 + 20000,
         BIN2_CHR = V3, BIN2_START = V4 - 20000, BIN2_END = V4 + 20000)
# Select the derived columns by name rather than by position (11:16), so the export
# does not depend on the input having exactly 10 columns.
view.cols <- c("BIN1_CHR", "BIN1_START", "BIN1_END", "BIN2_CHR", "BIN2_START", "BIN2_END")
write.table(df.view[, view.cols], "GABA372_fithic_python.1Mbrange.P0.01.tsv", quote = FALSE, row.names = FALSE, sep = "\t")
## Convert into TADs caller input from Dixon et al.
# HICPRO_PATH/bin/utils/sparseToDense.py -b hic_results/matrix/dixon_2M/raw/1000000/dixon_2M_1000000_abs.bed hic_results/matrix/dixon_2M/iced/1000000/dixon_2M_1000000_iced.matrix --perchr --di
# Attempt the per-chromosome dense conversion on BRUT inside the HiCPro environment,
# using the GABA372 20000 raw .bed and iced matrix (failed: traceback recorded below).
source activate /lustre/or-scratch/cades-bsd/27n/envs/HiCPro
cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro
/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/utils/sparseToDense.py -b hic_results/matrix/GABA372/raw/20000/GABA372_20000_abs.bed hic_results/matrix/GABA372/iced/20000/GABA372_20000_iced.matrix --perchr --di
# NOTE(review): the traceback reports that module 'pandas' has no attribute 'read_csv' - possibly a
# broken pandas install or a local file shadowing the package in this environment; confirm.
# /lustre/or-scratch/cades-bsd/27n/envs/HiCPro/lib/python3.8/site-packages/iced/normalization/_ca_utils.py:8: UserWarning: The API of this module is likely to change. Use only for testing purposes
# warnings.warn(
# Traceback (most recent call last):
# File "/lustre/or-scratch/cades-bsd/27n/HiC-Pro_3.1.0/bin/utils/sparseToDense.py", line 92, in <module>
# counts = io.load_counts(args.filename, lengths=lengths)
# File "/lustre/or-scratch/cades-bsd/27n/envs/HiCPro/lib/python3.8/site-packages/iced/io/_io_pandas.py", line 34, in load_counts
# dataframe = pd.read_csv(filename, sep="\t", comment="#", header=None)
# AttributeError: module 'pandas' has no attribute 'read_csv'
# HOMER TADs/loops references: http://homer.ucsd.edu/homer/interactions2/HiCTADsAndLoops.html https://qcb.ucla.edu/wp-content/uploads/sites/14/2017/02/Workshop-10-HiC-D3.pdf
#cd /lustre/or-scratch/cades-bsd/27n/homer
# HOMER prerequisites on Andes, collected in the homer library directory.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer
# Download the UCSC hg38 gap and genomicSuperDups tables.
wget -O gap.txt.gz http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/gap.txt.gz
wget -O dups.txt.gz http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/genomicSuperDups.txt.gz
# Concatenate both tables and keep columns 2-4 as badRegions.bed.
# NOTE(review): these are UCSC tables, while the alignments in this project use NC_* RefSeq
# accessions - sequence names presumably need translating before this file is used; confirm.
zcat gap.txt.gz dups.txt.gz | cut -f2-4 > badRegions.bed
# HiCUP is cloned for its Conversion/hicup2homer script.
git clone https://github.com/StevenWingett/HiCUP.git
# Activate the HiCPro environment for the conversion step.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/HiCPro
# convert HiC-Pro output to homer-compatible file
#cd /lustre/or-scratch/cades-bsd/27n/gaba372.hic/hicpro/bowtie_results/bwt2/GABA372
# Run hicup2homer on the paired BAM of each GABA372 lane (L001-L004), in lane order.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/bowtie_results/bwt2/GABA372
hicup2homer=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/HiCUP/Conversion/hicup2homer
for lane in L001 L002 L003 L004; do
    "${hicup2homer}" "GABA372_AGTCAA_HWJW2DSXY_${lane}_001_GRCh38.p13.bt2.bwt2pairs.bam"
done
# load homer --> need to do this on andes due to no gpu on brut...
#cd /lustre/or-scratch/cades-bsd/27n/homer
# Install HOMER into the Andes homer library directory using its configure script.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer
wget http://homer.ucsd.edu/homer/configureHomer.pl
perl configureHomer.pl -install
# Return to the GABA372 bowtie results directory, which holds the *.bam.homer inputs used below.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/bowtie_results/bwt2/GABA372
# make Tag Directory
# Lane L001 -> tag1/, reading the hicup2homer output with -format HiCsummary (run log follows).
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/bin/makeTagDirectory tag1/ -format HiCsummary GABA372_AGTCAA_HWJW2DSXY_L001_001_GRCh38.p13.bt2.bwt2pairs.bam.homer
# Will parse file: GABA372_AGTCAA_HWJW2DSXY_L001_001_GRCh38.p13.bt2.bwt2pairs.bam.homer
#
# Creating directory: tag/ and removing existing *.tags.tsv
#
# Reading alignment file GABA372_AGTCAA_HWJW2DSXY_L001_001_GRCh38.p13.bt2.bwt2pairs.bam.homer
#
# Optimizing tag files...
# Estimated genome size = 3222542202
# Estimated average read density = 0.016767 per bp
# Total Tags = 54033778.0
# Total Positions = 49692234
# Average tag length = 1.0
# Median tags per position = 1 (ideal: 1)
# Average tags per position = 1.087
# Local interaction fraction (< 1kb): 15.00%
# Interchromosomal interaction fraction: 17.47%
# Fragment Length Estimate: 220
# Peak Width Estimate: 151
# !!! No reliable estimate for peak size
# Setting Peak width estimate to be equal to fragment length estimate
# Autocorrelation quality control metrics:
# Same strand fold enrichment: 1.2
# Diff strand fold enrichment: 1.1
# Same / Diff fold enrichment: 1.1
#
# Guessing sample is ChIP-Seq - may have low enrichment with lots of background
# Build tag directories tag2/ - tag4/ from the L002 - L004 .homer files, one lane at a time.
homer_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/bin
for lane in 2 3 4; do
    "${homer_bin}/makeTagDirectory" "tag${lane}/" -format HiCsummary "GABA372_AGTCAA_HWJW2DSXY_L00${lane}_001_GRCh38.p13.bt2.bwt2pairs.bam.homer"
done
# create background model
# Lane 1 (tag1/) at -res 20000 with -bgonly; the run log follows.
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/bin/analyzeHiC tag1/ -res 20000 -bgonly
# Genome Size=3221493853.0
# No Hi-C background model found for 20000 bp resolution. Creating...
# Generating Background using -fullModel
# Genome Size=3221493853.0
# Calculating PE Tag Coverage:................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
# Avg interactions per peak = 308.0 +/- 163.1 (354.2)
#
# Finding Interactions to average into expected profile (-fullModel)...
# NC_000001.11 (avg coverage: 377.2, coverage ratio=inf)
# NC_000002.12 (avg coverage: 368.5, coverage ratio=inf)
# Same -bgonly background-model call for tag2/ - tag4/, run in lane order.
homer_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/bin
for lane in 2 3 4; do
    "${homer_bin}/analyzeHiC" "tag${lane}/" -res 20000 -bgonly
done
# create normalized interaction matrix for the whole genome
# One analyzeHiC call per lane; stdout goes to <tagN>/..._L00N_whole_genome_matrix.txt.
# NOTE(review): -nomatrix is passed together with a stdout redirect - confirm the redirected
# file actually receives a matrix with this flag combination.
homer_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/bin
for lane in 1 2 3 4; do
    "${homer_bin}/analyzeHiC" "tag${lane}/" -res 20000 -norm -override -nomatrix > "tag${lane}/GABA372_AGTCAA_HWJW2DSXY_L00${lane}_whole_genome_matrix.txt"
done
# run PCA
#runHiCpca.pl test_pca tag/ -res 20000 -superRes 20000 -genome hg19 -corrDepth 1
# visualize with HiCPlotter
#python /u/home/galaxy/collaboratory/apps/HiCPlotter/HiCPlotter.py -f rawdata_20000.matrix -bed rawdata_20000_abs.bed -n raw -chr chr8 -o raw_chr8 -tri 1 -r 20000 -hmc 5 -mm 10 -ptr 1 -pcd 1 -pcdf hESC_domains_hg19.bed -hist pca_fl4_500k.PC1.bedGraph -hl pca -hm 30
# identify significant interactions
# Work inside the GABA372 bowtie results directory, where the lane tag directories live.
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/bowtie_results/bwt2/GABA372
# Lane 1: write the interaction list at 20 kb resolution / 1 Mb super-resolution.
homer_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/bin
"$homer_bin/analyzeHiC" tag1/ -res 20000 -superRes 1000000 -interactions GABA372_AGTCAA_HWJW2DSXY_L001_significantInteractions.txt -nomatrix
# Genome Size=3218638470.0
# Total Hi-C interactions: 25020761
# Calculating PE Tag Coverage:..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
#
# Average interaction count in regions = 14340.5 +/- 6439.9 (16018.7 without outliers)
# NC_000001.11 (avg coverage: 16943.5, coverage ratio=1.090)
# NC_000002.12 (avg coverage: 16810.3, coverage ratio=1.082)
# NC_000003.12 (avg coverage: 16637.0, coverage ratio=1.071)
# NC_000004.12 (avg coverage: 15465.7, coverage ratio=0.995)
# Regions with too many or too few reads: 6.61%
# Total Significant Interactions: 1085
# Lane 2: same interaction call as for tag1.
homer_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/bin
"$homer_bin/analyzeHiC" tag2/ -res 20000 -superRes 1000000 -interactions GABA372_AGTCAA_HWJW2DSXY_L002_significantInteractions.txt -nomatrix
# Genome Size=3221493853.0
# Total Hi-C interactions: 27016889
# Calculating PE Tag Coverage:................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
#
# Average interaction count in regions = 15350.3 +/- 6928.0 (17163.6 without outliers)
# NC_000001.11 (avg coverage: 18164.8, coverage ratio=1.083)
# NC_000002.12 (avg coverage: 17998.2, coverage ratio=1.073)
# NC_000003.12 (avg coverage: 17804.1, coverage ratio=1.062)
# NC_000004.12 (avg coverage: 16537.1, coverage ratio=0.986)
# Regions with too many or too few reads: 6.69%
# Total Significant Interactions: 1095
# Lanes 3 and 4: same interaction call as for tag1/tag2.
homer_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/bin
for lane in 3 4; do
    "$homer_bin/analyzeHiC" "tag${lane}/" -res 20000 -superRes 1000000 -interactions "GABA372_AGTCAA_HWJW2DSXY_L00${lane}_significantInteractions.txt" -nomatrix
done
# find TADs and loops
cp GABA372_AGTCAA_HWJW2DSXY_L001_significantInteractions.txt tag1/.
# Call TADs and loops per lane. Every lane gets an explicit -o prefix so the
# resulting <prefix>.tad.2D.bed / <prefix>.loop.2D.bed files have known names
# (previously only tag1 had -o, and the merge step referenced placeholder
# exp1r1/exp1r2/exp21/exp2r2 files that none of these commands create).
# NOTE(review): -window (15000) is smaller than -res (20000) -- confirm intended.
homer_bin=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/homer/bin
prefix=GABA372_AGTCAA_HWJW2DSXY
loop_beds=()
tad_beds=()
for lane in 1 2 3 4; do
    out="${prefix}_L00${lane}"
    "$homer_bin/findTADsAndLoops.pl" find "tag${lane}/" -cpu 10 -res 20000 -window 15000 -genome hg38 -p badRegions.bed -o "$out"
    loop_beds+=("${out}.loop.2D.bed")
    tad_beds+=("${out}.tad.2D.bed")
done
# Merge the per-lane calls into one loop set and one TAD set.
"$homer_bin/merge2Dbed.pl" "${loop_beds[@]}" -loop > merged.loop.2D.bed
"$homer_bin/merge2Dbed.pl" "${tad_beds[@]}" -tad > merged.tad.2D.bed
# score TADs and loops
"$homer_bin/findTADsAndLoops.pl" score -tad merged.tad.2D.bed -loop merged.loop.2D.bed -d tag1/ tag2/ tag3/ tag4/ -cpu 10 -o output
https://nf-co.re/hic/usage https://zhonglab.gitbook.io/3dgenome/chapter2-computational-analysis/3.2-higer-order-data-analysis/analytical-pipelines/hic-pro-pipeline ## HiC-Pro https://www.bioinformatics.babraham.ac.uk/projects/hicup/read_the_docs/html/index.html ## HiCUP http://bioinformatics.age.mpg.de/presentations-tutorials/presentations/modules/hiC/HiC-tutorial.html ## HiTC https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4347522/ ## understanding HiC output https://link.springer.com/article/10.1007/s12551-018-0489-1#Sec4 ## downstream analyses (compartments-HiTC, TADs, interactions) https://www.frontiersin.org/articles/10.3389/fgene.2019.01079/full ## HiCeekR
“Most tools presented in this review store data in different formats, and only few provide utilities to convert from one format to another.” - Since Hi-C matrices are symmetric and sparse, a more efficient format is the “sparse” format where only nonzero entries of half of the matrix are reported as “row column value” triplets. This format is also called coordinated list or COO and is used by HiC-Pro - In the case of high-resolution matrices, using these formats can produce files that are large and difficult to manage. To overcome this problem, matrices can be saved using highly compressed binary formats: the “.cool” format is based on HDF5 and is used by the cooler pipeline; the “.hic” format is used instead by the Juicer pipeline
http://www.vaquerizaslab.org/wp-content/uploads/2019/10/Ing-Simmons_Vaquerizas-Development_2019-dev177162.full_.pdf https://github.com/lucidif/HiCeekR
# Install the tooling needed to pull HiCeekR (and its TopDom dependency) from GitHub.
install.packages("devtools")
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
library(devtools)
install_github("HenrikBengtsson/TopDom")
# Bioconductor repositories are passed so install_github can resolve those dependencies.
install_github("lucidif/HiCeekR", repos = BiocManager::repositories())
# Launch HiCeekR from the shared library directory.
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes")
library(HiCeekR)
HiCeekR()
# Error in utils::browseURL(appUrl) :
# 'browser' must be a non-empty character string
# create configuration files (HCR.config and HCRwd.info)
# define path as /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output
--> To visualize the data in the WashU Epigenome Browser (http://epigenomegateway.wustl.edu/browser/), take the gInteraction files and convert both the chromosome names and the file format, e.g. [chr1 713605 715737 chr1:720589-722848,2]
# Fetch the interaction counts. The local destination is quoted because the
# Dropbox path contains spaces and parentheses, which the shell would otherwise
# split / reject.
scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/fithic.interactionCounts.gz "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
gunzip fithic.interactionCounts.gz
# Rename RefSeq accessions to chrN names in a single sed pass; the dots are
# escaped so they only match a literal '.'.
sed -e 's/NC_000001\.11/chr1/g' -e 's/NC_000002\.12/chr2/g' -e 's/NC_000003\.12/chr3/g' \
    -e 's/NC_000004\.12/chr4/g' -e 's/NC_000005\.10/chr5/g' -e 's/NC_000006\.12/chr6/g' \
    -e 's/NC_000007\.14/chr7/g' -e 's/NC_000008\.11/chr8/g' -e 's/NC_000009\.12/chr9/g' \
    -e 's/NC_000010\.11/chr10/g' -e 's/NC_000011\.10/chr11/g' -e 's/NC_000012\.12/chr12/g' \
    -e 's/NC_000013\.11/chr13/g' -e 's/NC_000014\.9/chr14/g' -e 's/NC_000015\.10/chr15/g' \
    -e 's/NC_000016\.10/chr16/g' -e 's/NC_000017\.11/chr17/g' -e 's/NC_000018\.10/chr18/g' \
    -e 's/NC_000019\.10/chr19/g' -e 's/NC_000020\.11/chr20/g' -e 's/NC_000021\.9/chr21/g' \
    -e 's/NC_000022\.11/chr22/g' -e 's/NC_000023\.11/chrX/g' -e 's/NC_000024\.10/chrY/g' \
    fithic.interactionCounts > fithic.interactionCounts.chr.txt
# Emit one record per interaction: chrA, startA, endA, "chrB:startB-endB,score",
# with each anchor widened by +/-2500 bp around the input position.
awk '{ print $1 "\t" ($2 - 2500) "\t" ($2 + 2500) "\t" $3 ":" ($4 - 2500) "-" ($4 + 2500) "," $5 }' fithic.interactionCounts.chr.txt > fithic.interactionCounts.chr.wustl.txt
### file too large... subset to intrachromosomal interactions that are less than 1Mb apart
# The distance test uses the absolute difference; the previous
# "a-b < 1e6 || b-a < 1e6" form was true for every pair, so nothing was filtered.
awk '$1 == $3 { d = $2 - $4; if (d < 0) d = -d; if (d < 1000000) print $1 "\t" ($2 - 2500) "\t" ($2 + 2500) "\t" $3 ":" ($4 - 2500) "-" ($4 + 2500) "," $5 }' fithic.interactionCounts.chr.txt > fithic.interactionCounts.chr.intra.1Mb.wustl.txt
### or use fithic output file
# On the cluster: convert the FitHiC output of both GABA samples to the
# browser text format (chrN names, anchors widened by +/-2500 bp).
hic_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output
for sample in GABA372 GABA376; do
    sample_lc=$(printf '%s' "$sample" | tr '[:upper:]' '[:lower:]')
    cd "$hic_root/${sample_lc}.hic/hicpro/fithic/5kb" || continue
    # Single sed pass; dots escaped so they match only a literal '.'.
    sed -e 's/NC_000001\.11/chr1/g' -e 's/NC_000002\.12/chr2/g' -e 's/NC_000003\.12/chr3/g' \
        -e 's/NC_000004\.12/chr4/g' -e 's/NC_000005\.10/chr5/g' -e 's/NC_000006\.12/chr6/g' \
        -e 's/NC_000007\.14/chr7/g' -e 's/NC_000008\.11/chr8/g' -e 's/NC_000009\.12/chr9/g' \
        -e 's/NC_000010\.11/chr10/g' -e 's/NC_000011\.10/chr11/g' -e 's/NC_000012\.12/chr12/g' \
        -e 's/NC_000013\.11/chr13/g' -e 's/NC_000014\.9/chr14/g' -e 's/NC_000015\.10/chr15/g' \
        -e 's/NC_000016\.10/chr16/g' -e 's/NC_000017\.11/chr17/g' -e 's/NC_000018\.10/chr18/g' \
        -e 's/NC_000019\.10/chr19/g' -e 's/NC_000020\.11/chr20/g' -e 's/NC_000021\.9/chr21/g' \
        -e 's/NC_000022\.11/chr22/g' -e 's/NC_000023\.11/chrX/g' -e 's/NC_000024\.10/chrY/g' \
        "${sample}_fithic_python.res5000.1Mbrange.P0.01.txt" \
      | awk '{ print $1 "\t" ($2 - 2500) "\t" ($2 + 2500) "\t" $3 ":" ($4 - 2500) "-" ($4 + 2500) "," $5 }' \
      > "${sample}_fithic_python.res5000.1Mbrange.P0.01.wustl.txt"
done
# Fetch both converted files. The destination is quoted because the Dropbox
# path contains spaces and parentheses.
scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/fithic/5kb/GABA372_fithic_python.res5000.1Mbrange.P0.01.wustl.txt "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba376.hic/hicpro/fithic/5kb/GABA376_fithic_python.res5000.1Mbrange.P0.01.wustl.txt "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
### how many of the sig interactions are consistent between GABA372 and GABA376???
# Load the FitHiC calls for the two GABA samples (space-separated, no header).
df.1 <- read.delim("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/fithic/5kb/GABA372_fithic_python.res5000.1Mbrange.P0.01.txt", header=F, sep=" ", stringsAsFactors = F)
df.2 <- read.delim("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba376.hic/hicpro/fithic/5kb/GABA376_fithic_python.res5000.1Mbrange.P0.01.txt", header=F, sep=" ", stringsAsFactors = F)
nrow(df.1) # 572,268
nrow(df.2) # 857,037
# Row ids let the two anchor-wise intersections be paired up again below.
df.1$id <- seq.int(nrow(df.1))
df.2$id <- seq.int(nrow(df.2))
library(tidygenomics)
# First anchor: +/-2500 bp window around V2, matched on chromosome V1.
df.1$start <- df.1$V2-2500
df.1$end <- df.1$V2+2500
df.2$start <- df.2$V2-2500
df.2$end <- df.2$V2+2500
df.1.2.first <- genome_intersect(df.1, df.2, by=c("V1", "start", "end"), mode="both")
nrow(df.1.2.first) # 4,842,018
# Second anchor: +/-2500 bp window around V4, matched on chromosome V3.
df.1$start <- df.1$V4-2500
df.1$end <- df.1$V4+2500
df.2$start <- df.2$V4-2500
df.2$end <- df.2$V4+2500
df.1.2.second <- genome_intersect(df.1, df.2, by=c("V3", "start", "end"), mode="both")
nrow(df.1.2.second) # 4,863,250
library(dplyr)
# Keep only (df.1 row, df.2 row) pairs whose first AND second anchors overlap.
df.1.2.all <- inner_join(df.1.2.first, df.1.2.second, by=c("id.x", "id.y"))
nrow(df.1.2.all) # 194,314
# Suffix layout after the join: the inner .x/.y marks df.1 vs df.2, the outer
# .x/.y marks first vs second intersection. V5.x.x and V5.x.y are therefore the
# same df.1 value; the cross-sample mean needs V5.x.x (df.1) and V5.y.x (df.2).
df.1.2.all$mean.score <- (df.1.2.all$V5.x.x + df.1.2.all$V5.y.x) / 2
write.table(df.1.2.all[,c(1,22,23,25,44,45,46)], "fithic_python.res5000.1Mbrange.P0.01_intersectGABA372GABA376.txt", quote=F, col.names=F, row.names=F, sep="\t")
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba376.hic/hicpro/fithic/5kb/
# Rename accessions to chrN names (single sed pass, literal dots) and reshape
# the 7-column intersect table into the browser text format.
sed -e 's/NC_000001\.11/chr1/g' -e 's/NC_000002\.12/chr2/g' -e 's/NC_000003\.12/chr3/g' \
    -e 's/NC_000004\.12/chr4/g' -e 's/NC_000005\.10/chr5/g' -e 's/NC_000006\.12/chr6/g' \
    -e 's/NC_000007\.14/chr7/g' -e 's/NC_000008\.11/chr8/g' -e 's/NC_000009\.12/chr9/g' \
    -e 's/NC_000010\.11/chr10/g' -e 's/NC_000011\.10/chr11/g' -e 's/NC_000012\.12/chr12/g' \
    -e 's/NC_000013\.11/chr13/g' -e 's/NC_000014\.9/chr14/g' -e 's/NC_000015\.10/chr15/g' \
    -e 's/NC_000016\.10/chr16/g' -e 's/NC_000017\.11/chr17/g' -e 's/NC_000018\.10/chr18/g' \
    -e 's/NC_000019\.10/chr19/g' -e 's/NC_000020\.11/chr20/g' -e 's/NC_000021\.9/chr21/g' \
    -e 's/NC_000022\.11/chr22/g' -e 's/NC_000023\.11/chrX/g' -e 's/NC_000024\.10/chrY/g' \
    fithic_python.res5000.1Mbrange.P0.01_intersectGABA372GABA376.txt \
  | awk '{ print $1 "\t" ($2 - 2500) "\t" ($3 + 2500) "\t" $4 ":" ($5 - 2500) "-" ($6 + 2500) "," $7 }' \
  > fithic_python.res5000.1Mbrange.P0.01_intersectGABA372GABA376.wustl.txt
# Destination quoted: the Dropbox path contains spaces and parentheses.
scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba376.hic/hicpro/fithic/5kb/fithic_python.res5000.1Mbrange.P0.01_intersectGABA372GABA376.wustl.txt "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
# transfer all cell types to view in Wash U Epigenome browser
# On the cluster: convert the FitHiC output of the GLU and OLIG samples to the
# browser text format (chrN names, anchors widened by +/-2500 bp).
hic_root=/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output
for sample in GLU372 GLU376 OLIG372 OLIG376; do
    sample_lc=$(printf '%s' "$sample" | tr '[:upper:]' '[:lower:]')
    cd "$hic_root/${sample_lc}.hic/hicpro/fithic/5kb" || continue
    # Single sed pass; dots escaped so they match only a literal '.'.
    sed -e 's/NC_000001\.11/chr1/g' -e 's/NC_000002\.12/chr2/g' -e 's/NC_000003\.12/chr3/g' \
        -e 's/NC_000004\.12/chr4/g' -e 's/NC_000005\.10/chr5/g' -e 's/NC_000006\.12/chr6/g' \
        -e 's/NC_000007\.14/chr7/g' -e 's/NC_000008\.11/chr8/g' -e 's/NC_000009\.12/chr9/g' \
        -e 's/NC_000010\.11/chr10/g' -e 's/NC_000011\.10/chr11/g' -e 's/NC_000012\.12/chr12/g' \
        -e 's/NC_000013\.11/chr13/g' -e 's/NC_000014\.9/chr14/g' -e 's/NC_000015\.10/chr15/g' \
        -e 's/NC_000016\.10/chr16/g' -e 's/NC_000017\.11/chr17/g' -e 's/NC_000018\.10/chr18/g' \
        -e 's/NC_000019\.10/chr19/g' -e 's/NC_000020\.11/chr20/g' -e 's/NC_000021\.9/chr21/g' \
        -e 's/NC_000022\.11/chr22/g' -e 's/NC_000023\.11/chrX/g' -e 's/NC_000024\.10/chrY/g' \
        "${sample}_fithic_python.res5000.1Mbrange.P0.01.txt" \
      | awk '{ print $1 "\t" ($2 - 2500) "\t" ($2 + 2500) "\t" $3 ":" ($4 - 2500) "-" ($4 + 2500) "," $5 }' \
      > "${sample}_fithic_python.res5000.1Mbrange.P0.01.wustl.txt"
done
# Fetch the converted files. The destination is quoted because the Dropbox
# path contains spaces and parentheses.
scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu372.hic/hicpro/fithic/5kb/GLU372_fithic_python.res5000.1Mbrange.P0.01.wustl.txt "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/glu376.hic/hicpro/fithic/5kb/GLU376_fithic_python.res5000.1Mbrange.P0.01.wustl.txt "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig372.hic/hicpro/fithic/5kb/OLIG372_fithic_python.res5000.1Mbrange.P0.01.wustl.txt "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
scp noshayjm@dtn.ccs.ornl.gov:/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/olig376.hic/hicpro/fithic/5kb/OLIG376_fithic_python.res5000.1Mbrange.P0.01.wustl.txt "/Users/27n/Dropbox (ORNL)/ORNL.Noshay/Projects/Sprint.Opioid.ATAC/HiC.Pro.Output/."
7 June 2022 - HiC Data integration
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/
git clone https://github.com/broadinstitute/ABC-Enhancer-Gene-Prediction.git
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
## pre-run macs peak files: /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/peaks
# call peaks
# conda env create --name abc.macs -f macs.yml
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged
macs2 callpeak \
-t /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.bam \
-n gaba.rmdups.uniq.merged.macs2 \
-f BAM \
-g hs \
-p .1 \
--call-summits \
--outdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged
# WARNING @ Wed, 08 Dec 2021 13:27:47: Too few paired peaks (33) so I can not build the model! Broader your MFOLD range parameter may erase this error. If it still can't build the model, we suggest to use --nomodel and --extsize 147 or other fixed number instead.
# WARNING @ Wed, 08 Dec 2021 13:27:47: Process for pairing-model is terminated!
macs2 callpeak \
-t /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.bam \
-n gaba.rmdups.uniq.merged.macs2 \
-f BAM \
-g hs \
-p .1 \
--nomodel \
--extsize 147 \
--call-summits \
--outdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged
# sort narrow peaks file
bedtools sort -faidx /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size.bed -i /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak > /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted
# Step 1: call candidate regions
# make TSS bed file
R
library(dplyr)
library(tidyr)
setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/")
# Gene-level GTF records: V1 = sequence name, V4/V5 = start/end, V7 = strand, V9 = attributes.
df <- read.delim("GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.gtf", header=F, sep="\t")
# 500 bp window centred on the TSS (V4 on the + strand, V5 otherwise).
# The start is clamped at 0: a gene within 250 bp of the start of its sequence
# would otherwise get a negative coordinate, which is not a valid BED interval.
df.tss <- df %>% mutate(chr = V1, start = pmax(0, ifelse(V7 == "+", V4 - 250, V5 - 250)), end = ifelse(V7 == "+", V4 + 250, V5 + 250))
# Columns: chr, start, end, attributes, strand.
df.tss.sub <- df.tss[,c(10:12,9,7)]
# Keep only the first ';'-separated attribute, then split it on the space into
# its key ("type") and value ("gid").
df.id <- separate(df.tss.sub, V9, c("gene_id"), sep=";")
df.id2 <- separate(df.id, gene_id, c("type", "gid"), sep=" ")
# Written columns: chr, start, end, gid, strand.
df.print <- df.id2[,c(1:3,5,6)]
write.table(df.print, "GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.TSS500bp.bed", quote=F, row.names=F, col.names=F, sep="\t")
# Whole-gene BED: V1, V4, V5, gid, 0, strand.
df.id <- separate(df, V9, c("gene_id"), sep=";")
df.id2 <- separate(df.id, gene_id, c("type", "gid"), sep=" ")
df.id2$zero <- 0
df.print <- df.id2[,c(1,4,5,10,11,7)]
write.table(df.print, "GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.collabsed.bed", quote=F, row.names=F, col.names=F, sep="\t")
# One-column list of the gid values.
df.genelist <- data.frame(df.id2[,10])
write.table(df.genelist, "GCF_000001405.39_GRCh38.p13_genomic.genelist.bed", quote=F, row.names=F, col.names=F, sep="\t")
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
# conda env create -f /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ABC-Enhancer-Gene-Prediction/abcenv.yml
# conda activate final-abc-env
# conda env list
# conda create --name ABC python=3.6.4
conda activate ABC
# conda install -c conda-forge -c bioconda samtools bedtools Tabix MACS2 Java Juicer
# conda install -c conda-forge -c bioconda pyranges numpy pandas scipy pysam pyBigWig
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ABC-Enhancer-Gene-Prediction
samtools index /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.bam /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.bai
python src/makeCandidateRegions.py \
--narrowPeak /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted \
--bam /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.bam \
--outDir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/ \
--chrom_sizes /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size \
--regions_blocklist /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/hg38-blacklist.v2.ensembl.bed \
--regions_includelist /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.TSS500bp.bed \
--peakExtendFromSummit 250 \
--nStrongestPeaks 150000
# Step 2: quantifying enhancer activity
## TPM files here: /gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only
R
library(dplyr)
# Read the per-cell-type average TPM tables (columns used below: X, Control, Heroin).
setwd("/gpfs/alpine/syb105/proj-shared/Personal/atown/Projects/MtSinai/edgeR/Exons_only")
df.gaba <- read.delim("Gabba_heroin_vs_control_averageTPM.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
df.glu <- read.delim("Glu_heroin_vs_control_averageTPM.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
df.olig <- read.delim("Olig_heroin_vs_control_averageTPM.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)

# Mean of the Control and Heroin values per row.
df.gaba.mean <- mutate(df.gaba, mean = (Control + Heroin) / 2)
df.glu.mean <- mutate(df.glu, mean = (Control + Heroin) / 2)
df.olig.mean <- mutate(df.olig, mean = (Control + Heroin) / 2)

# Rows with a value of at least 1 in both conditions.
df.gaba.const <- subset(df.gaba, Control >= 1 & Heroin >= 1)
df.glu.const <- subset(df.glu, Control >= 1 & Heroin >= 1)
df.olig.const <- subset(df.olig, Control >= 1 & Heroin >= 1)

# Rows passing that filter in all three cell types.
df.gaba.glu.olig <- df.gaba.const %>%
  inner_join(df.glu.const, by = "X") %>%
  inner_join(df.olig.const, by = "X")

setwd("/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/")
# Two-column tables: first column and mean.
write.table(df.gaba.mean[, c(1, 4)], "Gaba_ABC_meanTPM.txt", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
write.table(df.glu.mean[, c(1, 4)], "Glu_ABC_meanTPM.txt", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
write.table(df.olig.mean[, c(1, 4)], "Olig_ABC_meanTPM.txt", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
# One-column lists: first column of each filtered table.
write.table(df.gaba.const[, 1, drop = FALSE], "Gaba_ABC_constitutive.txt", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
write.table(df.glu.const[, 1, drop = FALSE], "Glu_ABC_constitutive.txt", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
write.table(df.olig.const[, 1, drop = FALSE], "Olig_ABC_constitutive.txt", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
write.table(df.gaba.glu.olig[, 1, drop = FALSE], "All_ABC_constitutive.txt", quote = FALSE, row.names = FALSE, col.names = FALSE, sep = "\t")
## merge chipseq bam files
cd /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/H3K27Ac_ChIPseq/GABA/Mt_Sinai_BAM/
# ls *.bam > files.gaba.bamlist
bamtools merge -list files.gaba.bamlist -out mrege.SOX.sorted.bam
# Step 3: computing ABC score
## need to get HiC data in proper format (juicer or bedpe???)
## https://github.com/broadinstitute/ABC-Enhancer-Gene-Prediction
### convert hicpro output to bedpe format [https://rdrr.io/bioc/HiCcompare/man/hicpro2bedpe.html]
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/data/GABA372
# Reshape each valid pair into a 10-column bedpe record: both positions are
# widened by +/-5000 bp, the two middle columns are "." placeholders, and the
# last two columns are $4 and $7 of the input.
# Starts are clamped at 0 so positions below 5000 do not yield negative coordinates.
awk 'BEGIN { OFS = "\t" }
     function floor0(v) { return v < 0 ? 0 : v }
     { print $2, floor0($3 - 5000), $3 + 5000, $5, floor0($6 - 5000), $6 + 5000, ".", ".", $4, $7 }' \
    GABA372.allValidPairs > GABA372.allValidPairs.bedpe # Needs to be a separate file for each chromosome...
# Write each line to a file named after its first field. The redirection target
# is parenthesised: an unparenthesised `print > $1".bedpe"` is parsed
# differently by different awk implementations.
awk -F'\t' '{ print > ($1 ".bedpe") }' GABA372.allValidPairs.bedpe
gzip *.bedpe
# then put each of the files in their own directory
for i in ./N*.bedpe.gz
do
    [ -e "$i" ] || continue   # glob matched nothing
    d=$(basename "$i" .bedpe.gz)
    mkdir -p "$d" && mv "$i" "$d"
done
### juicebox:
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/ABC
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/ABC
# # git clone https://github.com/broadinstitute/ABC-Enhancer-Gene-Prediction.git
# #Download hic matrix file from juicebox
python ABC-Enhancer-Gene-Prediction/src/juicebox_dump.py \
--hic_file /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/gaba372.juicer/GABA372.allValidPairs.hic \
--juicebox "java -jar /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/juicer/juicer/scripts/common/juicer_tools.1.9.9_jcuda.0.8.jar" \
--outdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/gaba372.juicer/ ### this requires the .KR files????
# # Fit HiC data to powerlaw model and extract parameters
# python ABC-Enhancer-Gene-Prediction/src/compute_powerlaw_fit_from_hic.py \
# --hicDir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/gaba372.juicer/ \
# --outDir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/gaba372.juicer/powerlaw/ \
# --maxWindow 1000000 \
# --minWindow 5000 \
# --resolution 5000
#### convert .hic to bedpe
# Activate the ABC conda environment and run from the directory that holds the
# ABC-Enhancer-Gene-Prediction checkout.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/ABC
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/ABC
# The last option was written "-- resolution 5000": a bare "--" ends option
# parsing, so the resolution was never set and the trailing words were passed
# as stray positional arguments.
python ABC-Enhancer-Gene-Prediction/src/make_bedgraph_from_HiC.py \
--hic_dir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/gaba372.juicer/ \
--outdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/ \
--genes /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.collabsed.bed \
--resolution 5000
# ##### HiC data formats are so convoluted...
# # https://hicexplorer.readthedocs.io/en/latest/content/tools/hicConvertFormat.html
# # convert hicpro output to cool
conda install hicexplorer -c bioconda -c conda-forge
hicConvertFormat -m matrix.hicpro --bedFileHicpro hicpro.bed --inputFormat hicpro --outputFormat cool -o matrix.cool
###### HiCcompare R package
# Install BiocManager if it is not already available, then install HiCcompare through it.
if (!require("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("HiCcompare")
## Need to make .cool file
#!/bin/bash
#SBATCH -A SYB105
#SBATCH -J abc.model
#SBATCH -N 1
#SBATCH -p gpu
#SBATCH -t 24:00:00
#SBATCH --mem-per-cpu=0
# Batch job "abc.model": one node on the gpu partition, 24 h limit. Activates
# the ABC conda environment and runs the ABC scripts from the repository checkout.
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/anaconda3/envs/ABC
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ABC-Enhancer-Gene-Prediction
# Step 2: quantifying enhancer activity
# Inputs: candidate regions from the GABA merged peaks, the collapsed gene BED,
# the merged H3K27ac BAM, the ATAC BAM (--DHS), the GABA mean-TPM table and the
# all-cell-type constitutive gene list; results go to abc.output/.
# NOTE(review): --DHS points at rmdups.uniq.gaba.merged.sorted.bam, whereas the
# BAM indexed and passed to makeCandidateRegions.py earlier in this file is
# rmdups.uniq.gaba.merged.bam -- confirm the .sorted.bam exists and is indexed.
python src/run.neighborhoods.py \
--candidate_enhancer_regions /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/gaba.rmdups.uniq.merged.macs2_peaks.narrowPeak.sorted.candidateRegions.bed \
--genes /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GCF_000001405.39_GRCh38.p13_genomic.gene.sorted.collabsed.bed \
--H3K27ac /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/H3K27Ac_ChIPseq/GABA/Mt_Sinai_BAM/mrege.SOX.sorted.bam \
--DHS /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/rmdups.uniq.bam/rmdups.uniq.gaba.merged.sorted.bam \
--expression_table /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/Gaba_ABC_meanTPM.txt \
--chrom_sizes /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size \
--ubiquitously_expressed_genes /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/bwa.output/macs.output/rmdups.uniq.merged/All_ABC_constitutive.txt \
--cellType all \
--outdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/abc.output/
# Step 3: computing ABC score
python src/predict.py \
--enhancers /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/abc.output/EnhancerList.txt \
--genes /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/abc.output/GeneList.txt \
--HiCdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/brut.hicpro.output/gaba372.hic/hicpro/hic_results/data/GABA372/ \
--chrom_sizes /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size \
--hic_type bedpe \
--hic_resolution 5000 \
--scale_hic_using_powerlaw \
--threshold .02 \
--cellType GABA \
--outdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/abc.output/ \
--make_all_putative
# Step 4: get prediction files for variant overlap
python src/getVariantOverlap.py \
--all_putative /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/abc.output/EnhancerPredictionsAllPutative.txt.gz \
--chrom_sizes /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ref/GRCh38.p13.size \
--outdir /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/abc.output/
# sbatch /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.atac/ABC-Enhancer-Gene-Prediction/run.sh
heroin v control
Processing: merge FASTQ files from 2 replicate lanes → HTStream package → BWA-MEM → samtools → MACS2
similar number of peaks
PC separates by cell type
18 samples per cell type (heroin and control together)
consensus peak sets with custom script to look for overlap at position (overlapping replicates = 3)
109,758 gaba
114,296 glu
94,206 olig
gaba PCA plots…
glu shows similar results but less heroin/ctrl separation for females based on first plot
olig is less clear for heroin/ctrl
differential acetylation (DA) peaks
DESeq2, FC>1.5, FDR<0.05
model: ~ age + sex + condition
gaba: more up-regulated in heroin than down
glu: same
olig: same
more concordance with RNAseq for down-regulated genes than up-regulated genes?
assigning peaks to genes… GREAT (1Mb from TSS, can be assigned to multiple genes)
Key points:
“One would expect a notable overlap between H3K27ac and open chromatin, but open chromatin is a very focal signal, usually < 1kb per peak while H3K27ac can well stretch several kilobases.”
“Elements with strong DNase-seq signal but no H3K27ac ChIP–seq signal might be CTCF-bound topological elements, and elements with strong H3K27ac signal but no DNase-seq signal might be sequences that are close to strong enhancers, but do not per se have enhancer activity due to the spreading H3K27ac signal over hundreds to thousands of base pairs” From https://www.nature.com/articles/s41588-019-0538-0
directory: /gpfs/alpine/syb105/proj-shared/Data/NCBI-Mirror/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/ gtf file: GCF_000001405.39_GRCh38.p13_genomic.gtf fasta file: GCF_000001405.39_GRCh38.p13_genomic.fna
# copy all file names in trimmed directory and print to text file --> 358 files (179 R1/R2 pairs)
ls /gpfs/alpine/syb105/proj-shared/Data/Mt_Sinai_Brain_Opioid_Omics/chipseq_trimmed >> /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/trimmed.files.txt
# generate key file with path, R1, R2
/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/chip.key.txt
# bwa mem
# git clone https://github.com/lh3/bwa.git
# bwa index ref.fa
# salloc -A SYB105 -p gpu -N 1 -t 1:00:00
"""Print one shell command per sample for a chosen step of the ChIP-seq pipeline.

Usage: python <this script> STEP > commands/STEP.commands.txt

The key file holds one whitespace-separated record per line: data directory,
R1 name, R2 name. STEP selects which command is printed for every record
(see ``STEPS``). The per-step scripts invoked elsewhere in these notes
(scripts/bwa-mem.*.key.py) look like copies of this template with a single
command hard-wired into the final print -- TODO confirm.
"""
import sys

KEY_FILE = '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/chip.key.txt'
BWA = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/bwa/bwa"
REF_FASTA = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/ref/GCF_000001405.39_GRCh38.p13_genomic.fna"
PICARD_JAR = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/picard/build/libs/picard.jar"
BEDTOOLS = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3/bin/bedtools"
GENOME = "2862010578"  # passed to macs2 -g and bamCoverage --effectiveGenomeSize
NAME = "opioid.chip"   # macs2 -n experiment name
EXTSIZE = "147"        # macs2 --extsize

# Reference indexing does not depend on the sample, so it is emitted once.
FASTA_INDEX_CMD = BWA + " index -a bwtsw " + REF_FASTA + " GCF_000001405.39_GRCh38.p13_genomic"

STEPS = (
    "fasta", "bwa", "sort", "index1", "rmdups", "index", "unique",
    "index2", "bamtobed", "macs2", "macs", "bamtobigwig",
)


def build_commands(d, r1, r2):
    """Return a dict mapping step name to the shell command for one sample.

    d  -- data directory prefix from the key file (used as-is, so it must end
          with a path separator)
    r1 -- R1 read name; also used as the sample stem of every output file
    r2 -- R2 read name

    Every step reads from the directory the previous step writes to:
    sam/ -> bam/ -> rmdups.bam/ -> rmdups.uniq.bam/ -> bed/.
    """
    trimmed = f"{d}chipseq_trimmed/trimmed_"
    sam = f"bwa.output/sam/{r1}.bwa.sam"
    sorted_bam = f"bwa.output/bam/{r1}.sorted.bam"
    rmdups_bam = f"bwa.output/rmdups.bam/{r1}.rmdups.bam"
    uniq_bam = f"bwa.output/rmdups.uniq.bam/{r1}.rmdups.uniq.bam"
    uniq_bed = f"bwa.output/bed/{r1}.rmdups.uniq.bed"

    # NOTE(review): the reference is a GCF_ (RefSeq) assembly. If its sequence
    # names contain "_" (e.g. NC_000001.11), "grep -v -e _" removes every @SQ
    # line, $CHROMOSOMES is empty and samtools view applies no region filter.
    # Verify against the BAM header.
    unique = (
        "export CHROMOSOMES=$(samtools view -H " + rmdups_bam
        + " | grep '^@SQ' | cut -f 2"
        + " | grep -v -e _ -e chrM -e chrX -e chrY -e 'VN:'"
        + " | sed 's/SN://' | xargs echo);"
        + " samtools view -b -h -f 3 -F 4 -F 8 -F 256 -F 1024 -F 2048 -q 30 "
        + rmdups_bam + " $CHROMOSOMES > " + uniq_bam
    )

    return {
        "bwa": (
            f"{BWA} mem -t 4 {REF_FASTA}"
            f" '<zcat {trimmed}{r1}.fastq.gz'"
            f" '<zcat {trimmed}{r2}.fastq.gz'"
            f" > {sam}"
        ),
        # Reads the SAM from sam/ and writes the BAM to bam/, matching what the
        # bwa step produces and what index1/rmdups consume.
        "sort": (
            f"samtools sort -@ 4 -O bam -T bwa.output/bam/{r1}.tmp"
            f" -o {sorted_bam} {sam}"
        ),
        "index1": f"samtools index {sorted_bam}",
        "rmdups": (
            f"java -Xms512m -Xmx16g -jar {PICARD_JAR} MarkDuplicates"
            f" -I {sorted_bam} -M rmdups.output/{r1}_report.txt -O {rmdups_bam}"
            " --VALIDATION_STRINGENCY SILENT --ASSUME_SORTED true"
            " --REMOVE_DUPLICATES true"
        ),
        "index": f"samtools index {rmdups_bam}",
        "unique": unique,
        "index2": f"samtools index {uniq_bam}",
        "bamtobed": f"{BEDTOOLS} bamtobed -i {uniq_bam} > {uniq_bed}",
        "macs2": (
            f"macs2 callpeak -t {uniq_bed} -f BED -g {GENOME}"
            f" --outdir macs.output/{r1}.macs2 -n {NAME}"
            f" --keep-dup all --nomodel --extsize {EXTSIZE}"
        ),
        "macs": (
            f"macs2 callpeak --broad --SPMR -q 0.01 -t {uniq_bed} -f BED -g {GENOME}"
            f" --outdir macs.output/qval/{r1}.macs2 -n {NAME}"
            f" --keep-dup all --nomodel --extsize {EXTSIZE}"
        ),
        "bamtobigwig": (
            f"bamCoverage -b {rmdups_bam}"
            " -bl macs.output/hg38-blacklist.v2.ensembl.bed -p 2"
            f" --effectiveGenomeSize {GENOME} --normalizeUsing CPM -of bigwig"
            f" -o bwa.output/rmdups.bam/bigwig/{r1}.bw"
        ),
    }


def main(argv):
    """Print the command for step ``argv[1]`` for every key-file record.

    Returns a process exit status: 0 on success, 2 on a missing/unknown step.
    """
    if len(argv) != 2 or argv[1] not in STEPS:
        sys.stderr.write("usage: %s {%s}\n" % (argv[0], "|".join(STEPS)))
        return 2
    step = argv[1]
    if step == "fasta":
        print(FASTA_INDEX_CMD)
        return 0
    with open(KEY_FILE) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines in the key file
            d, r1, r2 = fields[:3]
            print(build_commands(d, r1, r2)[step])
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/
# python scripts/bwa-mem.key.py > commands/bwa-mem.commands.txt
python scripts/bwa-mem.1.key.py > commands/bwa-mem.1.commands.txt
python scripts/bwa-mem.1.2.key.py > commands/bwa-mem.1.2.commands.txt
python scripts/bwa-mem.1.3.key.py > commands/bwa-mem.1.3.commands.txt
python scripts/bwa-mem.2.key.py > commands/bwa-mem.2.commands.txt
python scripts/bwa-mem.2.andes.key.py > commands/bwa-mem.2.commands.andes.txt
python scripts/bwa-mem.2.summit.key.py > commands/bwa-mem.2.commands.summit.txt
python scripts/bwa-mem.3.key.py > commands/bwa-mem.3.commands.txt
python scripts/bwa-mem.4.key.py > commands/bwa-mem.4.commands.txt
python scripts/bwa-mem.4.1.key.py > commands/bwa-mem.4.1.commands.txt
python scripts/bwa-mem.5.key.py > commands/bwa-mem.5.commands.txt
python scripts/bwa-mem.5.summit.key.py > commands/bwa-mem.5.commands.summit.txt
python scripts/bwa-mem.6.key.py > commands/bwa-mem.6.commands.txt
python scripts/bwa-mem.6.bam.key.py > commands/bwa-mem.6.bam.commands.txt
python scripts/bamtobigwig.key.py > commands/bamtobigwig.commands.txt
python scripts/macs.key.py > commands/macs.commands.txt
# salloc -A SYB105 -p gpu -N 1 -t 1:00:00
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
conda install -c bioconda bwa
conda install -c bioconda samtools
conda install -c bioconda bamtools
conda install -c cyclus java-jdk
conda install -c bioconda picard
conda install -c bioconda macs2
conda install -c biobuilds picard
conda install -c bioconda bedtools
conda install -c biobuilds bedtools
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name fasta --time 48:00:00 --maxpernode 32 --nodes 1 commands/bwa.fasta.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name bwa --time 48:00:00 --maxpernode 32 --nodes 6 commands/bwa-mem.1.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name rmdups --time 48:00:00 --maxpernode 4 --nodes 6 commands/bwa-mem.2.commands.andes.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name macs2 --time 48:00:00 --maxpernode 4 --nodes 2 commands/bwa-mem.6.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name bamtobigwig --time 48:00:00 --maxpernode 20 --nodes 2 commands/bamtobigwig.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/andes/submit --name macs2 --time 48:00:00 --maxpernode 4 --nodes 2 commands/macs.commands.txt
# issue with conda: samtools had an issue with not finding a shared library libcrypto.so.1.0.0 --> conda install -c bioconda samtools=1.9 --force-reinstall
# not enough memory in picard on andes... try summit
# bsub -W 00:15 -nnodes 1 -P SYB105 -Is /bin/bash
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondasummit.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/summit/anaconda3
conda install -c krinsman ijavascript
conda install -c biobuilds picard
conda install -c biobuilds samtools
conda install -c biobuilds bedtools
# https://github.com/broadinstitute/picard
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bwa.sort --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.1.2.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name index1 --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.1.3.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name rmdups --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.2.commands.summit.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name index --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.3.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name unique --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.4.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name index2 --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.4.1.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name bamtobed --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.5.commands.summit.txt
### picard not running
https://github.com/broadinstitute/picard
### to give picard more memory usage
vi `which picard`
default_jvm_mem_opts="-Xms512m -Xmx16g" # add this line to picard script
### job submission not recognizing bedtools in conda environment... give direct path
conda info --envs
### QC: run after bwa-mem alignment and samtools sort steps complete
# samtools stat
"""Print one ``samtools stats`` QC command per sample.

Usage: python <this script> [stats|stats2|stats3] > commands/....txt

``stats`` (the default) covers the sorted BAMs, ``stats2`` the
duplicate-removed BAMs and ``stats3`` the duplicate-removed, filtered BAMs.
The key file holds one whitespace-separated record per line; the second field
(R1 name) is the sample stem used in every path.
"""
import sys

KEY_FILE = '/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/chip.key.txt'
SAMSTAT_DIR = "/gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/samstat"


def build_stats_commands(r1):
    """Return a dict mapping step name to the samtools stats command for sample ``r1``."""
    return {
        "stats": (
            f"samtools stats bwa.output/bam/{r1}.sorted.bam"
            f" > {SAMSTAT_DIR}/{r1}.samstat.txt"
        ),
        "stats2": (
            f"samtools stats bwa.output/rmdups.bam/{r1}.rmdups.bam"
            f" > {SAMSTAT_DIR}/rmdups/{r1}.rmdups.samstat.txt"
        ),
        # Both paths need a "/" between the directory and the sample stem;
        # without it the directory name is fused onto the file name.
        "stats3": (
            f"samtools stats bwa.output/rmdups.uniq.bam/{r1}.rmdups.uniq.bam"
            f" > {SAMSTAT_DIR}/rmdups.uniq/{r1}.rmdups.uniq.samstat.txt"
        ),
    }


def main(argv):
    """Print the selected command for every key-file record; return an exit status."""
    step = argv[1] if len(argv) > 1 else "stats"
    if len(argv) > 2 or step not in ("stats", "stats2", "stats3"):
        sys.stderr.write("usage: %s [stats|stats2|stats3]\n" % argv[0])
        return 2
    with open(KEY_FILE) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines in the key file
            print(build_stats_commands(fields[1])[step])
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/
python scripts/bwa-mem.stats.key.py > commands/bwa-mem.stats.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name stats --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.stats.commands.txt
python scripts/bwa-mem.stats2.key.py > commands/bwa-mem.stats2.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name stats2 --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.stats2.commands.txt
python scripts/bwa-mem.stats3.key.py > commands/bwa-mem.stats3.commands.txt
/gpfs/alpine/syb105/proj-shared/piet/codebase/summit/submit-3 --name stats3 --time 02:00:00 --maxpernode 20 --nodes 2 commands/bwa-mem.stats3.commands.txt
# multiqc
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/samstat
multiqc .
module load python
source /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/scripts/loadcondaandes.sh
conda activate /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/Libraries/andes/envs/test
cd /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/bwa.output/rmdups.bam/macs.output/merge/peaks
#loadGenome.pl -name hg38.ensembl -org null -fasta /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/ref/GCF_000001405.39_GRCh38.p13_genomic.fna -gtf /gpfs/alpine/syb105/proj-shared/Personal/noshayjm/projects/opioid.chip/ref/GCF_000001405.39_GRCh38.p13_genomic.gtf
# Annotate every merged MACS2 peak table (condition x cell type) against the
# custom hg38.ensembl HOMER genome; one .homer.txt is written per input table.
for condition in control heroin; do
for celltype in olig gaba glu; do
annotatePeaks.pl "${condition}.${celltype}.merge.macs2.txt" hg38.ensembl > "${condition}.${celltype}.merge.homer.txt"
done
done
figure 6 has some cool ChIP-seq visualizations they mixed with ATAC-seq to try and find enhancer RNAs: https://www.readcube.com/articles/10.3389/fonc.2021.743840