Basic ATACseq analysis pipeline

Section 1:

Reads Quality Control (QC) pre and post trimming
Alignment
Data preprocessing

1. Preparing Metadata Lists, Checking Read Quality, and Assessing DNA Contamination

# create a list of the fastq files
# Define the path to metadata files
path_metadata="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata"

# Extract only the first column (sample names) from metadata.tsv and skip the header (tail -n +2).
# This writes the sample names into a text file.
cut -f 1 $path_metadata/metadata.tsv | tail -n +2 > $path_metadata/list_only_sample_name.txt

# Change directory to the folder containing raw FASTQ reads
cd ~/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/Reads

# List all files in this directory (all FASTQ files), save the list to 'list_fastq_files.txt'
ls > $path_metadata/list_fastq_files.txt

# I used FASTQC to check the reads quality *before trimming*.
path_output="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/QC_reads_pre_trimming"
path_fastq_files="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/Reads/"

# Run FASTQC in parallel on each file listed in 'list_fastq_files.txt' using 4 jobs.
cat $path_metadata/list_fastq_files.txt | parallel -j 4 \
    fastqc -o $path_output -f fastq -t 2 $path_fastq_files/{}

# Download reference genomes for FASTQ Screen (used to check for contamination).
fastq_screen --get_genomes

# Check for DNA contamination in raw reads with FASTQ Screen
path_output="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/Fastq_screen_results/"
path_config="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/FastQ_Screen_Genomes"
path_metadata="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata"

# Run fastq_screen in parallel (2 jobs), each using 4 threads. 
cat $path_metadata/list_fastq_files.txt  | parallel -j 2 \
    fastq_screen --threads 4 \
                 --outdir $path_output \
                 --conf $path_config/fastq_screen.conf \
                 --aligner 'bowtie2'

2. Trimming Reads (fastp) and Post-Trimming QC

# Define file paths for sample list, raw FASTQ files, and trimmed FASTQ output
samples_list="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata/list_only_sample_name.txt"
path_fastq_files="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/Reads"
path_trimmed_fastq="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Trimmed_fastq"

# Loop through each sample name in our list
for i in $(cat $samples_list);
  do
    # Use fastp to trim adapters and low-quality bases on both R1 and R2.
    # --thread 9 uses 9 threads
    # --detect_adapter_for_pe automatically detects adapters for paired-end
    # --trim_front1 13  removes 13 bases from the front of R1
    # --trim_tail1 40   removes 40 bases from the tail of R1
    fastp -i $path_fastq_files/"$i"*_R1_001.fastq.gz \
          -I $path_fastq_files/"$i"*_R2_001.fastq.gz \
          -o $path_trimmed_fastq/"$i"_R1_001_trimmed.fastq \
          -O $path_trimmed_fastq/"$i"_R2_001_trimmed.fastq \
          --thread 9 \
          --detect_adapter_for_pe \
          --trim_front1 13  \
          --trim_tail1 40
  done

# After trimming, run FASTQC again to evaluate read quality *post-trimming*.
path_output="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/QC_reads_post_trimming"
path_fastq_files="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Trimmed_fastq"
path_metadata="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata"

# Run FASTQC in parallel on the trimmed files.
# 'list_fastq_files_trimmed.txt' presumably contains the list of trimmed FASTQ filenames.
cat $path_metadata/list_fastq_files_trimmed.txt | parallel -j 4 \
    fastqc -o $path_output -f fastq -t 3 $path_fastq_files/{}

3. Alignment to Genome Reference and Marking Duplicates

### First you need to index the genome reference -------------
path_ref="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/References"

# bowtie2-build indexes the reference genome so Bowtie2 can align to it.
# --threads 10 uses 10 CPU threads
bowtie2-build --threads 10 $path_ref/genome.fa $path_ref/genome.fa

### Now perform the alignment ---------------

WD="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/References"
fastq="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Trimmed_fastq"
OUTPUT="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments"

# Loop through each sample ID and run Bowtie2 in paired-end mode.
while read -r ID; 
  do
    bowtie2 -p 10 -q --local \
      -x "$WD"/genome.fa \
      -1 "$fastq"/"$ID"_R1_001_trimmed.fastq \
      -2 "$fastq"/"$ID"_R2_001_trimmed.fastq  \
      | samtools view -bS -@ 10 - \
      | samtools sort -@ 10 - > "$OUTPUT"/"$ID".sorted.bam;
  done < /home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata/list_only_sample_name.txt

### Mark duplicates -----------------
bam="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments"
out="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/mark_duplicate"

while read -r ID;
  do
    picard MarkDuplicates \
      I="$bam"/"$ID".sorted.bam \
      O="$out"/"$ID"_dedup_reads.bam \
      M="$out"/"$ID"_duplication_metrics.txt \
      REMOVE_DUPLICATES=false;
  done < /home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata/list_only_sample_name.txt


## Check the raw bam quality ---------------------
path_out="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/QC_BAM"
path_ref="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/References"
path_bam="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/mark_duplicate"

while read -r ID;
  do
    # Make a subfolder for each sample’s QC metrics.
    mkdir "$path_out"/"$ID"/
    
    # CollectMultipleMetrics from Picard calculates multiple read-level metrics,
    # including alignment summary, insert size distribution, etc.
    picard CollectMultipleMetrics \
        I="$path_bam"/"$ID"_dedup_reads.bam \
        O="$path_out"/"$ID"/ \
        R="$path_ref"/genome.fa;
  done < /home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata/list_only_sample_name.txt

# If desired, samtools stat can be run to gather additional stats
# e.g. samtools stat $path_bam/"$i".md.bam

BAM Filtering Against Blacklist Regions

### Download ENCODE blacklist regions (if not already downloaded)
wget https://github.com/Boyle-Lab/Blacklist/blob/master/lists/hg19-blacklist.v2.bed.gz

#Remove reads within the blacklist regions
path_blacklist="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/ENCODE_blacklist_regions"
path_name_samples="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata"
path_mark_duplicate="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/mark_duplicate"
path_out="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/Blacklisted_BAM"

for i in $(cat $path_name_samples/list_only_sample_name.txt);
    do
      # bedtools intersect -v excludes any reads overlapping the blacklist regions.
      bedtools intersect -nonamecheck -v -abam $path_mark_duplicate/"$i"_dedup_reads.bam \
                         -b $path_blacklist/blacklist.v2.bed > $path_out/"$i".tmp.bam
      
      # Sort and index the filtered BAM file
      samtools sort -O bam -o $path_out/"$i".blacklist_filtered.bam $path_out/"$i".tmp.bam -@ 6
      samtools index $path_out/"$i".blacklist_filtered.bam -@ 6
      rm $path_out/"$i".tmp.bam
    done

# stats of bam blacklisted
# If you want to capture stats:
path_name_samples="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT"
path_bam="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Blacklisted_BAM"

for i in $(cat $path_name_samples/list_only_sample_name.txt);
   do
     samtools stat $path_bam/"$i".blacklist_filtered.bam -@ 7 > $path_bam/"$i"_stat.txt
   done

5. Checking and Removing Mitochondrial Reads

Why remove MT reads? ATAC-seq commonly generates a high fraction of mitochondrial reads, especially in human samples. These reads are typically removed (along with any other organellar DNA) before downstream analyses because they can dominate coverage and do not usually inform on nuclear chromatin accessibility.

R snippet to visualize the % of mitochondrial reads:

# ATAC-seq experiments commonly include a high proportion of mitochondrial reads. 
# Use idxstatsBam in R or samtools idxstats in bash to assess % of MT reads.

library(Rsubread)
library(Rsamtools)
library(ggplot2)
library(magrittr)

# Read list of sample names
list_name <- data.table::fread("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata/list_only_sample_name.txt", 
                               sep = "\n", header = FALSE)$V1

p <- list()
for (i in list_name){
   sortedBAM <- paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/Blacklisted_BAM/", 
                       i,".blacklist_filtered.bam")
   # idxstatsBam gives the mapped read count per chromosome
   p[[i]] <- idxstatsBam(sortedBAM) %>% 
     ggplot(aes(seqnames, mapped, fill = seqnames)) + 
     geom_bar(stat = "identity") + 
     coord_flip()
}
p[1]  # For example, display the first plot

Bash commands to remove MT reads:

# Define paths for metadata, no-mito output BAM folder, and the blacklisted BAM folder
path_name_samples="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata"
path_bam_no_mito="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/without_Mito_BAM"
path_filtered_bam="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/Blacklisted_BAM"

# Loop over each sample, filter out lines containing "MT" in their read alignment (grep -v "MT")
for i in $(cat $path_name_samples/list_only_sample_name.txt);
   do
     samtools view -h $path_filtered_bam/"$i".blacklist_filtered.bam -@ 4 | \
     grep -v "MT" | \
     samtools sort -O bam -o $path_bam_no_mito/"$i".rmChrM_blacklisted_fduplicates.bam -T .
     
     # Index the resulting no-mito BAM file
     samtools index $path_bam_no_mito/"$i".rmChrM_blacklisted_fduplicates.bam -@ 7
     
     # Get stats if desired
     samtools idxstats $path_bam_no_mito/"$i".rmChrM_blacklisted_fduplicates.bam -@ 7 > \
     $path_bam_no_mito/"$i".rmChrM_blacklisted_fduplicates.bam.stat
   done

(Optional) R snippet to visualize the read distribution after removing MT:

library(Rsubread)
library(Rsamtools)
library(ggplot2)
library(magrittr)

list_name <- data.table::fread("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata/list_only_sample_name.txt",
                               sep = "\n", header = FALSE)$V1

p <- list()
for (i in list_name){
   sortedBAM <- paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/without_Mito_BAM/", 
                       i,".rmChrM_blacklisted_fduplicates.bam")
   p[[i]] <- idxstatsBam(sortedBAM) %>% 
     ggplot(aes(seqnames, mapped, fill = seqnames)) + 
     geom_bar(stat = "identity") + 
     coord_flip()
}
p[1]  # plot example

6. Generating bigWig Files

# Convert BAM files to bigWig for visualization in genome browsers
path_name_samples="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata"
path_bam_no_mito="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/without_Mito_BAM"
path_bigwig="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/bigwig_files"
log="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/bigwig_files/log"

for i in $(cat $path_name_samples/list_only_sample_name.txt);
   do
      bamCoverage --bam $path_bam_no_mito/"$i".rmChrM_blacklisted_fduplicates.bam \
                  --outFileName $path_bam_no_mito/"$i".bw \
                  --outFileFormat bigwig \
                  --binSize 10 \
                  --normalizeUsing RPKM \
                  --extendReads \
                  --ignoreDuplicates \
                  -p 8 2> $log/"$i".log
    done

#bamCoverage creates a coverage track (in bigWig format) from the final filtered BAM file. 
#binSize 10 means each bin in the bigWig is 10 bases. 
#normalizeUsing RPKM normalizes for library size and gene length (roughly). 
#extendReads extends reads to their estimated fragment length. 
#ignoreDuplicates does not count PCR duplicates. 
#-p 8 uses 8 threads.

7. Deep QC for Downstream analysis

# Load necessary packages for ATAC-seq QC and analysis.
# ATACseqQC: Provides ATAC-seq-specific QC functions.
# ChIPpeakAnno: Tools for peak annotation and genomic interval manipulation.
# MotifDb: A collection of known motifs (for downstream motif analysis).
# GenomicAlignments: Functions to read BAM files as GAlignments objects.
# BSgenome.Hsapiens.UCSC.hg38: The hg38 (human) reference genome sequences.
# TxDb.Hsapiens.UCSC.hg38.knownGene: Gene/transcript annotations for hg38.
# ggplot2, cowplot: Plotting libraries.
# GenomeInfoDb: Utilities for genome/chromosome metadata.
library(ATACseqQC)
library(ChIPpeakAnno)
library(MotifDb)
library(GenomicAlignments)
library(BSgenome.Hsapiens.UCSC.hg38)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
#library(phastCons60way.UCSC.mm10)  # Typically for mouse conservation data; commented out here
library(ggplot2)
library(cowplot)
library(GenomeInfoDb)

# -------------------------------------------
# 1) Reading a list of sample names from a file
# -------------------------------------------
# data.table::fread is a fast reader for text files (CSV, TSV, etc.).
# Here, we have a file containing sample names (one per line), no header.
# 'list' will be a data.table with one column containing sample IDs.
list <- data.table::fread("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata/list_only_sample_name.txt", 
                          header = FALSE)

# Print out the contents to visually inspect (optional).
list

# Loop through each sample name in the data.table
for (i in list$V1) {
  # Construct the full path to the final BAM file for each sample.
  # This BAM is presumably after blacklist removal, duplicate removal, and removing mitochondrial reads.
  bamFile <- paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/without_Mito_BAM/", 
                    i, ".rmChrM_blacklisted_fduplicates.bam")
  
  # Remove the .bam extension for labeling purposes (e.g., for plot titles).
  bamfile.labels <- gsub(".bam", "", basename(bamFile))
  
  # -------------------------------------------------
  # 2) Estimate Library Complexity
  # -------------------------------------------------
  # estimateLibComplexity and readsDupFreq are functions from ATACseqQC that
  # estimate how complex a library is (e.g., duplication rates).
  # We save the plot to a PDF.
  pdf(paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/DeepQC_for_ATAC/",
             i, "_Library_complexity.pdf"),
      width = 10, height = 7)
  estimateLibComplexity(readsDupFreq(bamFile))
  dev.off()
  
  # -------------------------------------------------
  # 3) Fragment Size Distribution
  # -------------------------------------------------
  # fragSizeDist from ATACseqQC calculates the distribution of fragment lengths 
  # (often 50–150 bp for nucleosome-free fragments, ~180 bp for mononucleosome, etc.).
  # This helps verify typical ATAC-seq fragment size profiles.
  pdf(paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/DeepQC_for_ATAC/",
             i, "_Fragment_size.pdf"),
      width = 10, height = 7)
  fragSizeDist(bamFile, bamfile.labels)
  dev.off()
  
  # -------------------------------------------------
  # 4) Promoter vs Transcript (PT) Score
  # -------------------------------------------------
  # PTscore is a metric that compares the read coverage around promoters vs. the rest 
  # of the transcript body, giving insight into the distribution of coverage.
  
  pdf(paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/DeepQC_for_ATAC/",
             i, "_PTscore_plot.pdf"),
      width = 10, height = 7)
  
  # Retrieve all transcripts from the TxDb for hg38
  txs <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
  
  # Read the BAM file into a GAlignments object
  gal <- readGAlignments(bamFile)
  
  # Ensure chromosome naming style matches ("UCSC" style: "chr1", "chr2", etc.).
  seqlevelsStyle(gal) <- "UCSC"
  seqlevelsStyle(txs) <- "UCSC"
  
  # Calculate the Promoter vs Transcript score
  pt <- PTscore(gal, txs)
  
  # Plot the PT score on the y-axis vs. the log2 mean coverage on the x-axis
  plot(pt$log2meanCoverage, pt$PT_score, 
       xlab = "log2 mean coverage",
       ylab = "Promoter vs Transcript")
  dev.off()
  
  # -------------------------------------------------
  # 5) NFRscore (Nucleosome Free Regions score)
  # -------------------------------------------------
  # The NFRscore function compares coverage in regions expected to be nucleosome-free 
  # (e.g., near TSS) vs. nucleosome-occupied regions, providing a measure of how
  # “open” those promoter/TSS regions are.
  
  pdf(paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/DeepQC_for_ATAC/",
             i, "_NFRscore_plot.pdf"),
      width = 10, height = 7)
  
  # NFRscore requires the same GAlignments (gal) and transcript definitions (txs).
  nfr <- NFRscore(gal, txs)
  
  # Plot the NFR score vs. coverage
  plot(nfr$log2meanCoverage, nfr$NFR_score, 
       xlab = "log2 mean coverage",
       ylab = "Nucleosome Free Regions score",
       main = "NFRscore for 200bp flanking TSSs",
       xlim = c(-10, 0), ylim = c(-5, 5))
  dev.off()
  
  # -------------------------------------------------
  # 6) TSS Enrichment Score
  # -------------------------------------------------
  # TSS enrichment measures how strongly reads pile up at the TSS (transcription start site). 
  # A higher TSS enrichment score indicates good ATAC-seq data quality (clear TSS peaks).
  
  pdf(paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/DeepQC_for_ATAC/",
             i, "_TSSscore_plot.pdf"),
      width = 10, height = 7)
  
  # TSSEscore calculates TSS enrichment and returns a numeric vector of values across bins 
  # around the TSS.
  tsse <- TSSEscore(gal, txs)
  
  # tsse$TSSEscore is the overall TSS enrichment metric
  tsse$TSSEscore
  
  # Plot the coverage profile around the TSS. The x-axis is distance to TSS
  # (from -1000 bp to +1000 bp in increments of 100, for example).
  plot(100*(-9:10-.5), tsse$values, type = "b", 
       xlab = "distance to TSS",
       ylab = "aggregate TSS score")
  dev.off()
}  


# What Each Metric Tells You
# Library Complexity (estimateLibComplexity):
# 
# Estimates the number of unique fragments in your library. A higher complexity is generally better (it means fewer duplicates).
# Fragment Size Distribution (fragSizeDist):
# 
# Shows the distribution of fragment lengths for your ATAC-seq library.
# In ATAC-seq, typical fragment length “peaks” correspond to nucleosome-free fragments (~50–150 bp), mononucleosomes (~180 bp), dinucleosomes (~360 bp), etc.
# Promoter vs Transcript Score (PTscore):
# 
# Compares read coverage in promoter regions vs. the broader transcript body.
# This can reveal whether the library has a strong promoter signal or is more evenly distributed along gene bodies.
# NFRscore (NFRscore):
# 
# Assesses how open the nucleosome-free regions are (usually around TSS).
# A higher score implies stronger “nucleosome-free” signals in promoter regions.
# TSS Enrichment (TSSEscore):
# 
# Measures how strongly reads pile up at TSSs compared to flanking regions.
# A high TSS enrichment (> 6 or 7) is often indicative of good ATAC-seq library prep and data quality.

# Load libraries
library(ATACseqQC)
library(GenomicFeatures)
library(BSgenome.Hsapiens.UCSC.hg38)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(ChIPseeker)
library(Rsamtools)
library(GenomicAlignments)
library(GenomicRanges)

# ATACseqQC: The main package providing convenience functions for ATAC-seq quality control (e.g., read shifting, fragment size partitioning, TSS enrichment, etc.).
# ChIPpeakAnno: Tools for genomic interval annotation and manipulation (peak annotation, overlap operations, etc.).
# MotifDb: A database of known motifs (usually used downstream for motif analysis).
# GenomicAlignments, Rsamtools, GenomicRanges: Core Bioconductor packages to read/manage BAM files, aligned reads, and genomic intervals.
# GenomicFeatures, TxDb.Hsapiens.UCSC.hg38.knownGene: Provides a transcript database (TxDb) for hg38, from which we can extract TSS, promoters, exons, transcripts, etc.
# BSgenome.Hsapiens.UCSC.hg38: The human reference genome (hg38) DNA sequences, used for various genome-based analyses.
# ChIPseeker: Another package for annotation and visualization of peaks, often can be used for obtaining promoter regions.
# ggplot2, cowplot: Graphics libraries for plotting.
# GenomeInfoDb: Provides functions to handle genome metadata (like chromosome lengths, naming, etc.).

# Define the path to metadata files
list <- data.table::fread("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata/list_only_sample_name.txt", header = FALSE)
list
for (i in list$V1){
  bamfile <- paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/without_Mito_BAM/", 
                    i, ".rmChrM_blacklisted_fduplicates.bam")
  # readBamFile(): A convenience function from ATACseqQC that reads a BAM file and returns a GAlignmentsList object. With asMates = TRUE, it treats paired-end data as mates.
  gal <- readBamFile(
    bamFile = bamfile,
    asMates = TRUE
  )
  # shiftGAlignmentsList(gal): An ATACseqQC function that “shifts” the read alignments to account for the Tn5 insertion offset. This is essential in ATAC-seq to correctly pinpoint the actual cut sites (the Tn5 transposase offsets the ends of reads by ~4–5 bp). The output gal1 is the shifted version of the alignments.
  gal1 <- shiftGAlignmentsList(gal)
  ## bamfile tags to be read in
  possibleTag <- list(
    "integer"=c("AM","AS","CM","CP","FI","H0","H1","H2", 
                "HI","IH","MQ","NH","NM","OP","PQ","SM","TC","UQ"), 
    "character"=c("BC","BQ","BZ","CB","CC","CO","CQ","CR",
                  "CS","CT","CY","E2","FS","LB","MC","MD","MI","OA","OC","OQ",
                  "OX","PG","PT","PU","Q2","QT","QX","R2","RG","RX","SA","TS","U2")
  )
  
  # Those two vectors—one named `"integer"` and the other named `"character"`—are essentially **a curated list of possible optional SAM/BAM tags** that the code is prepared to look for when parsing reads. 
  # 
  # ### Where do these tags originate?
  # 
  # All of these two-letter codes (e.g. `AS`, `NM`, `RG`, etc.) come from the **SAM/BAM specification** (the standard file format for storing sequencing reads and their alignments). The SAM spec defines many official tags—some are numeric/integer, others are textual/character—and also allows for custom tags.
  # 
  # - **Examples**:
  #   - `NM` (integer) is often “edit distance” or “number of mismatches.”  
  # - `AS` (integer) is sometimes “alignment score.”  
  # - `RG` (character) stands for “read group.”  
  # - `MC` (character) can hold a “mate CIGAR” string.  
  # - `SA` (character) can list “other alignments” for multi-mapped reads.  
  # 
  # Aligners, data pipelines, and tools like Picard or GATK may add or consume these tags to store additional information about each read.
  # 
  # ### Why the code checks them
  # 
  # When running something like `scanBam()` or `readBamFile()`, you can specify which optional tags you’d like to extract. This snippet just enumerates a **broad set** of possible tags that *might* appear in the BAM so the script can see which ones are actually present (i.e., which have nonempty data in the first 100 reads). 
  # 
  # In short, it’s a quick way to discover what metadata your particular BAM file may have stored for each read.
  # 
  library(Rsamtools)
  bamTop100 <- scanBam(BamFile(bamfile, yieldSize = 100),
                       param = ScanBamParam(tag=unlist(possibleTag)))[[1]]$tag
  tags <- names(bamTop100)[lengths(bamTop100)>0]
  tags
  
  # We load a transcript database for hg38 from the package
  txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
  
  # Extract TSS regions (+/- 1 kb, for example)
  # The promoterRegions() is a convenience function in ChIPseeker
  # promoterRegions(): A function (from ChIPseeker or ChIPpeakAnno, depending on version) that returns intervals around the TSS: from -1000 bp to +1000 bp. We store these in tssRegions for potential usage.
  tssRegions <- promoterRegions(
    txdb, upstream = 1000, downstream = 1000,
    by = "transcription_start_site"
  )
  
  library(BSgenome.Hsapiens.UCSC.hg38)
  library(phastCons100way.UCSC.hg19)   # Note: This is hg19 (different assembly!)
  library(TxDb.Hsapiens.UCSC.hg38.knownGene)
  
  # Here we see a mix of hg38 references (for the genome and TxDb) plus phastCons100way.UCSC.hg19 (which is actually a conservation track for hg19). This might be just a tutorial example, or maybe they specifically want conservation scores from hg19. Usually, you want the same assembly across the board.
  # txs <- transcripts(...) extracts all transcript coordinates from the TxDb, then filters to chr1 for a smaller test run.
  # genome <- Hsapiens sets the reference genome object to the hg38 BSgenome.Hsapiens.UCSC.hg38.
  
  txs <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
  ## run program for chromosome 1 only
  txs <- txs[seqnames(txs) %in% "chr1"]
  genome <- Hsapiens
  
  ## split the reads into NucleosomeFree, mononucleosome, 
  ## dinucleosome and trinucleosome.
  ## and save the binned alignments into bam files.
  objs <- splitGAlignmentsByCut(gal1, txs=txs, genome=genome, outPath = outPath,
                                conservation=phastCons100way.UCSC.hg19)
  ## list the files generated by splitGAlignmentsByCut.
  dir(outPath)
  
  # splitGAlignmentsByCut(gal1, ...) is an ATACseqQC function that partitions your ATAC-seq alignments by fragment length, typically creating categories:
  # - NucleosomeFree (e.g., 0–100 bp)
  # - mononucleosome (~180–247 bp)
  # - dinucleosome (~315–473 bp)
  # - trinucleosome
  # It writes out separate BAM files for each category in outPath and returns an R list named something like objs. That list has keys for each category.
  # conservation=phastCons100way.UCSC.hg19 optionally adds conservation scores (although we have a mismatch of assemblies here).
  # Finally, dir(outPath) shows which BAM files and logs were created.
  
  # promoters() is a function from GenomicFeatures that returns promoter regions from txdb (±1000 bp around the TSS).
  TSS <-  promoters(txdb, upstream=1000, downstream=1000)
  # unique(TSS): If some transcripts share the same start site, we want to remove duplicates.
  TSS <- unique(TSS)
  # estLibSize(bamfile): A function from ATACseqQC that estimates the total number of aligned reads in the BAM (often for normalization).
  (librarySize <- estLibSize(bamfile))
  
  # Suppose 'TSS' is your promoter GRanges, each ~2 kb wide
  # We want to “shrink” them to a single base at the TSS
  # Many ATACseqQC functions (like enrichedFragments) expect TSS to be a single-nucleotide position (the actual start) so the function can internally extend upstream/downstream.
  # resize(..., width=1, fix="start") creates a 1 bp range anchored at the TSS start coordinate.
  TSS_singleBase <- resize(TSS, width = 1, fix = "start")
  
  
  dws <- ups <- 1010
  sigs <- enrichedFragments(
    gal          = objs[c("NucleosomeFree", "mononucleosome", 
                          "dinucleosome",   "trinucleosome")],
    TSS          = TSS_singleBase, 
    librarySize  = librarySize,
    seqlev       = seqlev,
    TSS.filter   = 0.5,
    n.tile       = NTILE,
    upstream     = ups,
    downstream   = dws
  )
  
  # We set dws and ups both to 1010, meaning we’ll look ±1010 bp around each TSS.
  # gal = objs[c("NucleosomeFree", "mononucleosome", "dinucleosome", "trinucleosome")] picks out the four fragment categories from the list returned by splitGAlignmentsByCut().
  # TSS_singleBase is the set of 1 bp TSS positions.
  # librarySize is used for normalization.
  # seqlev might be a vector specifying which chromosomes to consider.
  # TSS.filter = 0.5 tells the function to discard TSS that have insufficient coverage or are not covered in at least 50% of the region.
  # n.tile = NTILE: The number of bins across the ±1010 region (e.g. 101 bins if NTILE=101).
  # The function returns a list of coverage matrices (one per fragment category) showing the binned coverage around TSS.
  # 
  
  sigs.log2 <- lapply(sigs, function(.ele) log2(.ele+1))
  #plot heatmap
  pdf(paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/DeepQC_for_ATAC/", i, "_featureAlignedHeatmap.pdf"), width = 10, height = 7)
  featureAlignedHeatmap(
    sigs.log2, 
    reCenterPeaks(TSS, width=ups+dws),
    zeroAt = .5, 
    n.tile = NTILE
  )
  dev.off()
  # Instead of a heatmap for each feature, we want an average coverage line plot across all TSS. This function:
  # Aligns coverage data around each TSS,
  # Averages across all TSS in each bin,
  # Returns a matrix with columns corresponding to each fragment type (NF, mono, etc.) and rows corresponding to each bin.
  
  ## rescale the nucleosome-free and nucleosome signals to 0~1
  range01 <- function(x){(x-min(x))/(max(x)-min(x))}
  out <- apply(out, 2, range01)
  pdf(paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/QC/DeepQC_for_ATAC/", i, "_matplot.pdf"), width = 10, height = 7)
  matplot(out, type="l", xaxt="n", 
          xlab="Position (bp)", 
          ylab="Fraction of signal")
  axis(1, at=seq(0, 100, by=10)+1, 
       labels=c("-1K", seq(-800, 800, by=200), "1K"), las=2)
  abline(v=seq(0, 100, by=10)+1, lty=2, col="gray")
  dev.off()
  
  # range01 function normalizes a vector to [0,1] by subtracting the min and dividing by the range.
  # apply(out, 2, range01): For each column in out (each fragment category), apply the normalization.
  # matplot(out, type="l", ...): Plots all columns as separate lines.
  # xaxt="n" means we manually define the X-axis with axis(1, ...).
  # The X-axis is labeled from -1K to 1K, corresponding to the 101 bins covering 2 kb.
  # abline(...) draws vertical dashed lines to mark every 10 bins for reference.
  # This final plot is a multi-line coverage profile comparing, for instance, nucleosome-free vs. mononucleosomal signals around TSS after normalizing them to the same 0–1 scale.
}

Section 2:

Peaks calling
Differential peaks accessibility analysis
Differential accessible Peaks annotations

1. Peaks calling

# -------------------------------------------------------------------------
# 1) Define paths for:
#    - The metadata listing sample names
#    - The processed BAM files (without mitochondrial reads, blacklisted reads removed, etc.)
#    - The output directory where MACS2 peak-calling results will be stored
# -------------------------------------------------------------------------
path_name_samples="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/INPUT/metadata"
path_bam_no_mito="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/without_Mito_BAM"
path_out="/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Peaks_calling_MACS2"

# -------------------------------------------------------------------------
# 2) Loop through each sample name (one per line) in "list_only_sample_name.txt"
# -------------------------------------------------------------------------
for i in $(cat $path_name_samples/list_only_sample_name.txt);
  do
    # Run MACS2 callpeak for each sample using:
    # -t: The input BAM file (already filtered for no MT, blacklisted regions, duplicates, etc.)
    # --format BAMPE: Indicates the data is PAIRED-END.
    # -g hs: The effective genome size for human (hg38/hg19). 
    #         For mouse, you might use -g mm, etc.
    # -n $i: The name for output files (prefix). This will help you identify which sample the peaks come from.
    # --outdir $path_out: Place all MACS2 peak-calling outputs in $path_out.
    # --keep-dup auto: MACS2 will decide how to handle duplicate reads automatically based on the data.
    # -q 0.01: Use a q-value (FDR) cutoff of 0.01 for significant peaks.
    # --nomodel: Skip the model-building step that MACS2 often uses for typical ChIP-seq data; 
    #            recommended for ATAC-seq or any data where MACS2's model might not apply well.
    # Optional parameters (commented out):
    #    --shift -37 --extsize 73
    #  These can be used if you want to manually shift or extend reads (useful for certain protocols).
    
    macs2 callpeak \
      -t $path_bam_no_mito/"$i".rmChrM_blacklisted_fduplicates.bam \
      --format BAMPE \
      -g hs \
      -n "$i" \
      --outdir $path_out \
      --keep-dup auto \
      -q 0.01 \
      --nomodel 
      #  --shift -37 \
      #  --extsize 73
  done


#Summary of Key Parameters
#-t <bam>: The input treatment file (your filtered ATAC-seq BAM).
#--format BAMPE: Specifies that your data is paired-end.
#-g hs: Effective genome size is set for the human genome (“hs”). Other common options: mm (mouse) or numeric approximations (e.g., --gsize 2.7e9 for human).
#-n <name>: The prefix for MACS2 output files.
#--outdir <directory>: Outputs all results (peaks, summits, logs) to <directory>.
#--keep-dup auto: Automatically determines how to handle duplicate reads.
#-q 0.01: Calls peaks at a false discovery rate (FDR) threshold of 1%.
#--nomodel: Disables MACS2’s fragment size estimation model. Recommended for ATAC-seq, as the Tn5 transposase already fragments DNA in a particular way, making the default model less useful.

2. Differential peaks accessibility analysis

Below is a detailed, line-by-line (or block-by-block) explanation of how each section of this code works and why it’s performed that way. This script is designed to:

Identify a set of non-redundant peaks that occur in at least two samples.
Count reads overlapping those peaks for each sample (using summarizeOverlaps).
Prepare a design matrix and run differential accessibility analysis (limma framework via DaMiRseq pre-processing).
Generate a final result table merging counts, normalized data, and statistics.
Create volcano plots of differentially accessible peaks.

a) Identifying Non-Redundant Peaks

# - Identifying non-redundant peaks:

# First, We will define a set of non-redundant peaks present in at least 2 samples 
# and use these to assess changes in nucleosome free ATACseq signal using DESeq2/limma.
# We take peaks from all samples and reduce them into a single set of non-redundant peaks.
# Then, create a matrix of presence/absence of these peaks over each sample.

# ChIPQC: Contains functions (like GetGRanges) for reading/processing peak files into GRanges objects.
library(ChIPQC)

# 'peaks' is a character vector of file paths to all .narrowPeak files generated 
# by MACS2 (or other peak callers).
peaks <- dir("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Peaks_calling_MACS2", 
             pattern = "*.narrowPeak", full.names = TRUE)

# dir(...): Lists files in the specified directory that match *.narrowPeak.
# full.names = TRUE: Returns the absolute paths to these files.

# Convert each .narrowPeak file into a GRanges object using ChIPQC:::GetGRanges().
# The 'simple = TRUE' parameter typically means it extracts only essential columns 
# (chromosome, start, end, etc.).
myPeaks <- lapply(peaks, ChIPQC:::GetGRanges, simple = TRUE) # myPeaks: A list of GRanges objects, one per file/sample.

# 'unlist(GRangesList(myPeaks))' flattens all GRanges from all samples into a single GRanges object.
# This is the union of all peaks (with potential overlap across samples).
x <- unlist(GRangesList(myPeaks)) # unlist(GRangesList(...)) merges/concatenates all GRanges into a single GRanges.

# 'allPeaksSet_nR' is essentially all the peaks from all samples combined into one set.
allPeaksSet_nR <- x


# We'll build an 'overlap' list to store presence/absence of each merged peak in each sample.
overlap <- list()
# This loop creates one logical vector per sample, indicating for each peak in the merged set whether it overlaps that sample’s peaks.
for (i in 1:length(myPeaks)) {
  # %over%: An operator from the GenomicRanges (or IRanges) ecosystem that tests for overlap between intervals.
  # '%over%' checks whether each peak in 'allPeaksSet_nR' overlaps with the i-th sample's peaks.
  # This returns a logical vector (TRUE if overlap exists, FALSE if not).
  overlap[[i]] <- allPeaksSet_nR %over% myPeaks[[i]]
}


# Combine all logical vectors into a matrix (columns = samples, rows = peaks).
overlapMatrix <- do.call(cbind, overlap)

# Assign column names based on the 'peaks' file names (basename removes the path).
colnames(overlapMatrix) <- basename(peaks)

# Attach this overlap matrix as metadata (mcols) to the GRanges object 'allPeaksSet_nR'. 
# mcols(...): Stores the presence/absence (TRUE/FALSE) data as metadata columns on the GRanges object.
# This means each row/peak in 'allPeaksSet_nR' now has columns indicating which samples contain that peak.
mcols(allPeaksSet_nR) <- overlapMatrix

# View a small subset (first 10 peaks) to verify the presence/absence table.
View(as.data.frame(allPeaksSet_nR[1:10, ]))

b) Counting for Differential ATAC-seq

# We now identify the number of samples in which each peak (row) is present. 
# 'rowSums()' will sum TRUE/FALSE across the row; 
# TRUE counts as 1, FALSE counts as 0 (coerced logically).

library(Rsubread)
occurrences <- rowSums(as.data.frame(elementMetadata(allPeaksSet_nR)))

# elementMetadata(allPeaksSet_nR) returns the mcols(...) from above.
# rowSums(...) with a logical matrix results in a numeric vector counting how many TRUE are in each row.


# Extract only those peaks that occur in at least 2 samples:
nrToCount <- allPeaksSet_nR[occurrences >= 2, ] # This yields a “non-redundant peak set” that’s found in >=2 samples.
View(as.data.frame(nrToCount))

# Now we want to count how many reads from each BAM align within each of these peaks.
# The SummarizedExperiment function 'summarizeOverlaps()' from GenomicAlignments 
# is commonly used for counting reads in specified ranges.

library(GenomicAlignments)

# 'bamsToCount' is a vector of the full paths of each BAM file to be counted.
bamsToCount <- dir("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Alignments/without_Mito_BAM", 
                   full.names = TRUE, pattern = "*.\\.bam$") # pattern = "*.\\.bam$": Regex to match “.bam” extension.


# summarizeOverlaps() takes:
# 1) 'nrToCount': The ranges (our non-redundant peaks).
# 2) 'bamsToCount': The BAM files for counting.
# 3) singleEnd = FALSE: Indicates paired-end data.
myCounts <- summarizeOverlaps(nrToCount, bamsToCount, singleEnd = FALSE) # myCounts is a SummarizedExperiment object with a count matrix and the ranges used.
View(myCounts)


# Here we rename the columns of the SummarizedExperiment 
# for easier identification of each sample.
colnames(myCounts) <- c("1740_230_25",
                        "1740_230_32",
                        "1740_230_33",
                        "1740_230_39",
                        "1740_230_60",
                        "1740_230_66")
# IMPORTANT: The names must match the sample IDs or replicate IDs you expect.

# saveRDS(...) and readRDS(...) store/retrieve R objects from disk.
# Save an object to a file
saveRDS(myCounts, file = "/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/myCounts.rds")

# Reload if needed
myCounts <- readRDS(file = "/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/myCounts.rds")

c) Preparing Input for the Model

### Create the proper counts table
# Extract row names from the SummarizedExperiment rowRanges
chr_name <- as.data.frame(myCounts@rowRanges@seqnames)
start_end_with <- as.data.frame(myCounts@rowRanges@ranges)

# Extract the counts matrix from 'myCounts' (the SummarizedExperiment).
# 'assay(myCounts)' returns the raw count matrix. We convert it to a data frame for convenience.
counts_matrix <- assay(myCounts) %>% as.data.frame()

# Check dimensions, column sums, etc.
dim(counts_matrix)
colSums(counts_matrix)
head(counts_matrix)

# Combine columns: chromosome name, start/end/width info, and the counts matrix.
Counts_table <- cbind(chr_name, start_end_with, counts_matrix)
colnames(Counts_table)[1] <- "chr"

# This builds a data frame with columns: chr | start | end | width | counts(sample1) | counts(sample2) | ....


# Create the metadata (sample-level information) for differential analysis.
Rep <- factor(c("DIFI",
                "DIFI",
                "SW480",
                "SW480",
                "HCA24",
                "HCA24"))
class <- factor(c("KO",
                  "Control",
                  "KO",
                  "Control",
                  "KO",
                  "Control"))

metaData <- data.frame(Rep, class, row.names = colnames(myCounts)) 
metaData # metaData: A data frame describing each sample’s replicate ID (Rep) and condition/class (class).


# The row names of metaData must match the columns of myCounts so that they can be mapped properly.

d) Build the Model and Perform Differential Analysis

(1) Load and prepare for normalization / modeling

# These libraries are for data transformation, visualization, linear modeling, surrogate variable analysis, etc.
library(reshape2)
library(DaMiRseq)
library(ggplot2)
library(limma)
library(Hmisc)
library(car)
library(corrplot)
library(pheatmap)
library(ggrepel)
library(edgeR)
library(RUVSeq)
library(RColorBrewer)
library(DESeq2)
library(tidyr)
library(tidyverse)
library(ggrepel)

colors <- brewer.pal(3, "Set2")

# A custom negation of '%in%'
'%!in%' <- function(x,y)!('%in%'(x,y))

# These packages handle various steps: data manipulation (tidyverse), normalization (DaMiRseq), linear modeling (limma), etc.


# 'unite()' merges multiple columns into one. We create a unique ID column from chr, start, end, width.
Counts_table_for_stats <- Counts_table %>% unite(col = "ID", c("chr", "start", "end", "width"), sep = "_")

# 'make.unique()' ensures each row name is unique (in case multiple peaks have identical coords).
Counts_table_for_stats$ID <- make.unique(Counts_table_for_stats$ID)
rownames(Counts_table_for_stats) <- Counts_table_for_stats$ID
Counts_table_for_stats <- Counts_table_for_stats[,-1]  # drop the 'ID' column (we moved it to rownames)

fileCounts <- Counts_table_for_stats
fileCovariates <- metaData

# Check that sample columns match row names in metadata
all(colnames(fileCounts) == rownames(fileCovariates))

# Convert these into a SummarizedExperiment object for DaMiRseq pipeline
se_obj <- DaMiR.makeSE(fileCounts, fileCovariates) # DaMiR.makeSE(...): Creates a SummarizedExperiment used by DaMiRseq for normalization/QC.

(2) Normalize raw counts

# DaMiR.normalization performs filtering based on CV, minimum counts, etc., 
# and can perform transformations (like VST) from DESeq2 or logCPM from edgeR.

data_norm <- DaMiR.normalization(
  data = se_obj,
  th.cv = 3,       # threshold for coefficient of variation 
  minCounts = 10,  # minimum read count
  fSample = 0.5,   # fraction of samples that must pass the threshold
  type = 'vst'     # transform type: 'vst' from DESeq2
)

# Plot QC stats after normalization: PCA, distribution, heatmap, RLE boxplot
# These are diagnostic plots to ensure data quality and look for outliers.
DaMiR.Allplot(data_norm, colData(data_norm), what = 'PCA')
DaMiR.Allplot(data_norm, colData(data_norm), what = 'distr')
DaMiR.Allplot(data_norm, colData(data_norm), what = 'heatmap', type = "spearman")
DaMiR.Allplot(data_norm, colData(data_norm), what = 'RLEbox')

(3) Surrogate Variable Analysis

# IMPORTANT: Surrogate variables (SVs) are hidden factors that may cause unwanted variability in data (e.g., batch effects).

#### Considering SV
svs <- DaMiR.SV(data_norm, th.fve = 0.95) # DaMiR.SV(...) finds SVs; DaMiR.SVadjust(...) corrects the data for them.
head(svs)

# Check correlation of surrogate variables with sample covariates (class, replicate)
DaMiR.corrplot(sv = svs, colData(data_norm)[1:2], sig.level = 0.05)

# Adjust for surrogate variables (removes unwanted variation)
data_adjust <- DaMiR.SVadjust(data_norm, svs, n.sv = 2)
head(assay(data_adjust))

# Visualize stats on data after removing the effect of surrogate variables
DaMiR.Allplot(data_adjust, colData(data_adjust), what = 'PCA')
DaMiR.Allplot(data_adjust, colData(data_adjust), what = 'distr')
DaMiR.Allplot(data_adjust, colData(data_adjust), what = 'RLEbox')

(4) Differential Accessibility Analysis (using limma)

### ------------- Do not use adjusted counts -------------

data_model <- as.data.frame(colData(data_norm))

# Build the design matrix with ~0 + class + Rep 
# '0 +' means we do not include an intercept; we get separate columns for each level of 'class' and 'Rep'.
design <- model.matrix(~0 + class + Rep, data = data_model) 

# Make a contrast that compares 'Control' vs. 'KO'
contrast <- makeContrasts(Control.vs.KO = classControl - classKO, levels = design)

# Fit the linear model with limma:
# Steps:
# - lmFit(...): Fit linear model to normalized expression counts.
# - contrasts.fit(...): Apply the contrast (i.e., difference between conditions).
# - eBayes(...): Empirical Bayes smoothing of variance.
# - topTable(...): Summarize results.

fit <- lmFit(assay(data_norm), design)
fit <- contrasts.fit(fit, contrasts = contrast)
fit <- eBayes(fit, trend = TRUE)

# Extract results
res_table <- topTable(fit, coef = 1, adjust = "BH", number = Inf, sort.by = "none")
res_table <- res_table[order(res_table$P.Value),]

# Visual checks with histograms of p-values, logFC
hist(res_table[,4], main="histogram of pValues", breaks=20, col="orange", xlab="pval")
res_table <- res_table[order(res_table$logFC),]
hist(res_table[,1], main="histogram of LogFC values", breaks=20, col="blue", xlab="logFC")


#########################################################################################################################################

### ------------- Use adjusted counts -------------

### IMPORTANT: This second approach incorporates surrogate variables in the design matrix.

data_model <- as.data.frame(cbind(colData(data_norm), svs))  # combine sample covariates + surrogate variables

# Now the design matrix includes 'class', 1 surrogate variable V1, and replicate 'Rep'.
design <- model.matrix(~0 + class + V1 + Rep, data = data_model)
# If there were more significant SVs, you'd add them (V2, V3, ...).

contrast <- makeContrasts(Control.vs.KO = classControl - classKO, levels = design)
fit <- lmFit(assay(data_norm), design)
fit <- contrasts.fit(fit, contrasts = contrast)
fit <- eBayes(fit, trend = TRUE)
res_table <- topTable(fit, coef=1, adjust="BH", number=Inf, sort.by = "none")
res_table <- res_table[order(res_table$adj.P.Val),]

# Quick histograms again
hist(res_table[,5], main="histogram of adj.P.Val", breaks=20, col="orange", xlab="pval")
res_table <- res_table[order(res_table$logFC),]
hist(res_table[,1], main="histogram of LogFC values", breaks=20, col="blue", xlab="logFC")

d) Generate a final result table merging counts, normalized data, and statistics.

### save results
# 'norm_data' is the normalized counts from data_norm, rounded to 3 decimals.
# We prepare each data frame so they can be merged by a common key ID.
norm_data <- round(assay(data_norm),3) %>% as.data.frame()
colnames(norm_data) <- c("Norm_1740_230_25",
                         "Norm_1740_230_32",
                         "Norm_1740_230_33",
                         "Norm_1740_230_39",
                         "Norm_1740_230_60",
                         "Norm_1740_230_66")

fileCounts$ID <- rownames(fileCounts)
rownames(fileCounts) <- NULL
res_table$ID <- rownames(res_table)
rownames(res_table) <- NULL

final_table_results <- left_join(res_table, fileCounts, by="ID") # left_join(...) merges the data frames, matching rows by ID.
norm_data$ID <- rownames(norm_data)
rownames(norm_data) <- NULL
final_table_results <- left_join(final_table_results, norm_data, by="ID")

# Put 'ID' column at the front
final_table_results <- final_table_results %>% relocate(ID, .before=logFC) # relocate(...) reorders the columns to put ID at the beginning.
final_table_results
dim(final_table_results)

### IMPORTANT: We label each peak as “NOT.SIGN”, “SIGN”, or “SIGN_UP/DOWN” depending on p-value, adjusted p-value, and log fold change thresholds.

# Define thresholds
p_th <- 0.05
adjp_th <- 0.05
Beta_th <- 0.5   # logFC threshold

# Create empty vectors for significance calls
sign.lfc.p.adj.vect <- sign.lfc.p.vect <- sign.vect.adj.p <- sign.vect.p <- rep("NOT.SIGN", length(rownames(final_table_results)))

# Based on p-value only
sign.vect.p[which(final_table_results$P.Value <= p_th)] <- "SIGN"

# Based on adjusted p-value
sign.vect.adj.p[which(final_table_results$adj.P.Val <= adjp_th )] <- "SIGN"

# Based on p-value + logFC
sign.lfc.p.vect[which(final_table_results$P.Value <= p_th & final_table_results$logFC >= Beta_th)] <- "SIGN_UP"
sign.lfc.p.vect[which(final_table_results$P.Value <= p_th & final_table_results$logFC <= -Beta_th)] <- "SIGN_DOWN"

# Based on adjusted p-value + logFC
sign.lfc.p.adj.vect[which(final_table_results$adj.P.Val <= adjp_th & final_table_results$logFC >= Beta_th)] <- "SIGN_UP"
sign.lfc.p.adj.vect[which(final_table_results$adj.P.Val <= adjp_th & final_table_results$logFC <= -Beta_th)] <- "SIGN_DOWN"

# Store these categories in the final results table
final_table_results$sign.p <- as.factor(sign.vect.p)
final_table_results$sign.adj.p <- as.factor(sign.vect.adj.p)
final_table_results$sign.lfc.p <- as.factor(sign.lfc.p.vect)
final_table_results$sign.lfc.adj.p <- as.factor(sign.lfc.p.adj.vect)
dim(final_table_results)
head(final_table_results)


# Split the 'ID' column back into 'chr', 'start', 'end', 'width' 
# so we can see genomic coordinates again.
final_table_results <- final_table_results %>% separate(col = "ID", into = c("chr", "start", "end", "width"), sep = "_") # The ID column was chr_start_end_width; we separate it back to 4 columns for clarity.

# Sort by chromosome, remove the 10th column if it’s not needed (likely an artifact).
final_table_results <- final_table_results[order(final_table_results$chr),]
final_table_results <- final_table_results[,-10]
head(final_table_results)

# save final dataset results
data.table::fwrite(final_table_results, 
                   "/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/Final_result_table.tsv", 
                   sep = "\t")

# We now have a comprehensive table of results that includes:
# logFC, p-values, adj. p-values, significance annotations, original counts, normalized counts, and genomic coordinates.

e) Create volcano plots of differentially accessible peaks

# ggplot2 for plotting, ggrepel for label repulsion (so labels don’t overlap).
library(ggplot2)
library(ggrepel)

# Volcano plot with raw p-values
# A classic volcano plot highlighting significantly up- or down-accessible peaks.
data <- final_table_results

# Create a 'Significance' column based on p-value < 0.05 and |logFC| > 0.5
data$Significance <- ifelse(data$P.Value < 0.05 & abs(data$logFC) > 0.5,
                            ifelse(data$logFC > 0.5, "Significant Up", "Significant Down"),
                            "Not Significant")

# ggplot of logFC vs. -log10(p-value), color-coded by significance
ggplot(data, aes(x = logFC, y = -log10(P.Value), color = Significance)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "blue") +
  scale_color_manual(values = c("Significant Up" = "red", 
                                "Significant Down" = "blue", 
                                "Not Significant" = "grey")) +
  theme_minimal() +
  labs(title = "Vulcano Plot of Differentially Accessible Peaks",
       x = "log2 Fold Change (logFC)",
       y = "-log10(p-value)") +
  theme(legend.title = element_blank()) +
  scale_x_continuous(limits = c(-2, 2)) + 
  geom_text_repel(
    data = subset(data, Significance != "Not Significant"),
    aes(label = rownames(subset(data, Significance != "Not Significant"))),
    size = 3, max.overlaps = 10
  )

# Volcano plot using adjusted p-values (FDR)
data <- final_table_results
data$Significance <- ifelse(data$adj.P.Val < 0.05 & abs(data$logFC) > 0.5,
                            ifelse(data$logFC > 0.5, "Significant Up", "Significant Down"),
                            "Not Significant")

ggplot(data, aes(x = logFC, y = -log10(adj.P.Val), color = Significance)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "blue") +
  scale_color_manual(values = c("Significant Up" = "red", 
                                "Significant Down" = "blue", 
                                "Not Significant" = "grey")) +
  theme_minimal() +
  labs(title = "Vulcano Plot of Differentially Accessible Peaks",
       x = "log2 Fold Change (logFC)",
       y = "-log10(adj.P.Val)") +
  theme(legend.title = element_blank()) +
  scale_x_continuous(limits = c(-2, 2)) + 
  geom_text_repel(
    data = subset(data, Significance != "Not Significant"),
    aes(label = rownames(subset(data, Significance != "Not Significant"))),
    size = 3, max.overlaps = 10
  )

# Same idea, but we threshold by adjusted p-value (FDR) instead of raw p-values.

3. Visualize peaks of Interest

Below is the fully commented code with detailed explanations of what each section does and why it is there. This script allows you to:

Check a Specific Peak: Manually supply 12 normalized counts for two conditions, visualize a combined bar + boxplot with significance annotation.
Find Peaks near a Gene: Filter a results table for a given gene, pivot the normalized columns to a long format, and test each peak’s difference between KO vs. Control conditions.
Visualize: Show boxplots of normalized read counts, superimpose means, lines, and significance levels (p-value). Facet by peak so multiple peaks near the same gene are easily comparable.

Visualization of a Specific Peak

### VISUALIZE PEAKS OF INTEREST
# # res is presumably a data frame or a GRanges converted to data frame containing peak coordinates and possibly additional info.
res <- final_table_results
# We subset rows where seqnames is “chr9” and the start/end positions match the peak of interest (3016773-3017326).
specific_peak <- res[res$seqnames == "chr9" &  
                           res$start == 3016773 & 
                           res$end == 3017326, ] %>%  as.data.frame()



# Provided data: 12 normalized count values for 12 samples (they match two conditions: KO or Control).
values <- c(0.929, 4.403, 2.059, 2.95, 1.579, 4.647, 1.766, 4.336, 2.902, 3.877, 2.64, 6.239)
samples <- c("Norm_1740_230_01", "Norm_174_230_09", "Norm_174_230_10", "Norm_174_230_17", 
             "Norm_174_230_19", "Norm_174_230_24", "Norm_174_230_42", "Norm_174_230_47", 
             "Norm_174_230_49", "Norm_174_230_53", "Norm_174_230_54", "Norm_174_230_59")

# Create a data frame with the sample names and associated count values.
df <- data.frame(Sample = samples, Count = values)

# Add a 'condition' column specifying whether each sample is KO or Control.
df$condition <- c("KO",
                  "Control",
                  "KO",
                  "Control",
                  "KO",
                  "Control",
                  "KO",
                  "Control",
                  "KO",
                  "Control",
                  "KO",
                  "Control")

# Sort the data frame by condition for convenient plotting order (KO first, then Control).
df_sorted <- df[order(df$condition), ]
print(df_sorted)  # Inspect the sorted data

# df_sorted$Sample ensures that the samples are aligned in the final plot in the order you prefer (first all KO, then Control).
# Turn 'Sample' into a factor, preserving the sorted order so that the bars/boxplots appear in that sequence.
df_sorted$Sample <- factor(df_sorted$Sample, levels = df_sorted$Sample) # # This step prevents default alphabetical sorting in ggplot.



library(ggplot2)
library(dplyr)
library(ggpubr)

# Perform a t-test comparing 'Count' between 'KO' and 'Control'.
t_test_result <- t.test(Count ~ condition, data = df_sorted) # t.test(...): Simple two-sample t-test.

# Extract the p-value from the t-test
p_value <- t_test_result$p.value

# Determine the significance level and corresponding asterisk(s).
# significance: a string “”, “”, “”, or “ns”.
significance <- ifelse(p_value < 0.001, "***",
                       ifelse(p_value < 0.01, "**",
                              ifelse(p_value < 0.05, "*", "ns"))) 

# Position the significance label a bit above the highest bar
y_position <- max(df_sorted$Count) + 0.5 # y_position: Where to place the annotation on the y-axis.

# Plot with bars (geom_col) plus a boxplot overlaid to show distribution
ggplot(df_sorted) +
  aes(x = Sample, y = Count, fill = condition) +
  
  # geom_col() draws the bar chart, grouped by condition using position_dodge.
  geom_col(position = position_dodge(width = 0.8)) +
  
  # geom_boxplot() overlays a boxplot (by condition) for the same data,
  # giving a sense of distribution. 'inherit.aes = FALSE' means we specify new aesthetics.
  geom_boxplot(data = df_sorted, 
               aes(x = condition, y = Count, group = condition), 
               inherit.aes = FALSE, 
               width = 0.3, alpha = 0.5, 
               outlier.shape = NA) +
  
  scale_fill_hue(direction = 1) +  # Change color palette
  theme_minimal() +
  
  # Rotate x-axis labels 45 degrees for readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  
  labs(title = "chr9:3016773-3017326",
       y = "Count",
       x = "Samples and Conditions") +
  
  # Add a single text label (the significance) between the two conditions
  geom_text(aes(x = 1.5, y = y_position, label = significance),  # geom_text(...) places the significance asterisk(s).
            inherit.aes = FALSE, size = 6) +
  
  # Add a segment (line) connecting the two groups (KO vs Control)
  geom_segment(aes(x = 1, xend = 2,  # geom_segment(...) draws a horizontal line spanning from x=1 to x=2 (the two groups).
                   y = y_position - 0.1, yend = y_position - 0.1),
               inherit.aes = FALSE, size = 0.5)

Exploring Peaks Near a Gene of Interest

# Load required libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggpubr)

# Suppose 'res' is a data frame or tibble with peak information, including 
# a 'SYMBOL' column linking peaks to nearby genes.
gene_of_interest <- "Tcf7l2"
# We filter the res data frame to find rows (peaks) annotated to the gene Tcf7l2 (in SYMBOL column).
peaks_near_gene <- res %>% filter(SYMBOL == gene_of_interest)


# If no peaks found for this gene, stop the script with a message
# A safety check to ensure we have at least one peak.
if (nrow(peaks_near_gene) == 0) {
  stop("No peaks found for the specified gene!")
}


# Identify columns that begin with "Norm_" (which presumably contain normalized counts).
normalized_columns <- grep("^Norm_", colnames(peaks_near_gene), value = TRUE) # grep("^Norm_", ...) finds columns like “Norm_174_...”.

# We'll pivot the relevant columns to a long format (Sample ~ Count).
plot_data <- peaks_near_gene %>%
  select(all_of(normalized_columns)) %>%
  pivot_longer(cols = everything(), names_to = "Sample", values_to = "Count") # pivot_longer(...) converts wide data (one column per sample) into long data (rows for each sample).

# For correct condition labeling, we define a manual mapping from sample names to conditions.
# This ensures that, for each Sample string, we can specify whether it belongs to the KO or Control group.
sample_condition_mapping <- data.frame(
  Sample = c(
    "Norm_1740_230_01", "Norm_174_023_009", "Norm_174_023_010", 
    "Norm_174_023_017", "Norm_174_023_019", "Norm_174_023_024", 
    "Norm_174_023_042", "Norm_174_023_047", "Norm_174_023_049", 
    "Norm_174_023_053", "Norm_174_023_054", "Norm_174_023_059"
  ),
  condition = c(
    "KO", "Control", "KO", "Control", "KO", 
    "Control", "KO", "Control", "KO", 
    "Control", "KO", "Control"
  )
)

# We recreate 'plot_data' but also keep the symbol, seqnames, start, end, 
# and do a pivot on 'Norm_' columns. Then we join the sample_condition_mapping to get conditions.
normalized_columns <- grep("^Norm_", colnames(peaks_near_gene), value = TRUE)
plot_data <- peaks_near_gene %>%
  select(SYMBOL, seqnames, start, end, all_of(normalized_columns)) %>%
  pivot_longer(cols = starts_with("Norm_"), 
               names_to = "Sample", values_to = "Count") %>%
  mutate(Peak = paste0(seqnames, ":", start, "-", end)) %>% # Peak = paste0(seqnames, ":", start, "-", end) creates a textual label for each peak’s genomic coordinate.
  left_join(sample_condition_mapping, by = "Sample")



####### --------------- Adding Significance via T-test for Each Peak ------------------ ###########

# For each Peak, perform a t-test comparing KO vs. Control. 
# Summarise returns a new data frame with one row per peak.
t_test_results <- plot_data %>%
  group_by(Peak) %>%
  summarise(
    p_value = t.test(Count ~ condition)$p.value
  ) %>%
  mutate(
    significance = case_when(
      p_value < 0.001 ~ "***",
      p_value < 0.01 ~ "**",
      p_value < 0.05 ~ "*",
      TRUE ~ "ns"
    ),
    # We'll put the significance label 0.5 units above the max value in plot_data
    y_position = max(plot_data$Count) + 0.5
  )

# Merge the t-test results back into 'plot_data' so each row (sample-peak combination) 
# has the significance info for that peak.
plot_data <- plot_data %>%
  left_join(t_test_results, by = "Peak")




####### --------------- Plot Each Peak in a Separate Facet ------------------ ###########

# We produce a faceted boxplot: one facet per peak.
ggplot(plot_data, aes(x = condition, y = Count, fill = condition)) +
  
  # geom_boxplot with alpha=0.5
  geom_boxplot(alpha = 0.5, width = 0.5, outlier.shape = NA) +
  
  # Add the mean of each group as a point (shape=18 is a star), plus a connecting line
  # This shows how the mean changes across groups (although there's only 2 groups here).
  stat_summary(
    fun = mean, 
    geom = "point", 
    shape = 18, 
    size = 3, 
    color = "black"
  ) +
  stat_summary(
    fun = mean,
    geom = "line",
    aes(group = 1),  # Connect all points with one line
    color = "black",
    size = 0.8
  ) +
  
  # Add the significance label on the plot. We place it at x=1.5 (between KO and Control).
  geom_text(
    data = t_test_results, 
    aes(x = 1.5, y = y_position, label = significance),
    inherit.aes = FALSE,
    vjust = -0.5
  ) +
  
  # facet_wrap(~ Peak) creates one plot per unique 'Peak'. 
  # scales = "free_y" means each plot can have its own y-axis range.
  facet_wrap(~ Peak, scales = "free_y") + # facet_wrap(~ Peak): Each peak from plot_data$Peak gets its own facet (mini-plot).
  scale_fill_hue(direction = 1) +
  theme_minimal() +
  
  labs(
    title = paste("Box Plot with Mean and Significance for All Peaks near Gene:", gene_of_interest),
    y = "Normalized Counts",
    x = "Condition"
  )

4. DAPs structural annotation

Below is a detailed, line-by-line (or block-by-block) explanation of the code. The script performs peak annotation (via ChIPseeker), visualization (covplot, pie charts, upset plots, annotation bar charts, etc.), and handles merged peak sets for different conditions. Comments explain what each piece does and why.

(a) Loading Packages and Setting Up

## loading packages
library(ggrepel)
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
library(clusterProfiler)

# ggrepel: Improves label placement in ggplot2-based plots (avoids text overlap).
# ChIPseeker: A package for peak annotation, visualization, and integration with other genomic data.
# TxDb.Hsapiens.UCSC.hg38.knownGene: A TxDb object (transcript database) for the human hg38 genome, used by ChIPseeker for annotation (e.g., finding promoters, TSS, gene models).
# txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene loads that database into an object named txdb.
# clusterProfiler: A package for functional enrichment analysis (not heavily used here, but often loaded together with ChIPseeker).

(b) Reading NarrowPeak Files and Inspecting

# files is a character vector of paths to .narrowPeak files.
files <- dir("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Peaks_calling_MACS2",  # dir(...): Lists all files matching *.narrowPeak in the specified directory.
             pattern = "*.narrowPeak",
             full.names = TRUE) # full.names=TRUE ensures we get full file paths instead of just filenames.
print(files)

# We manually assign sample names to each file path. This allows for easier referencing (like files[["1740_230_25"]]).
names(files) <- c("1740_230_25",
                  "1740_230_32",
                  "1740_230_33",
                  "1740_230_39",
                  "1740_230_60",
                  "1740_230_66")

# readPeakFile(...) (from ChIPseeker) reads a MACS2 peak file (here .narrowPeak format) into a GRanges object.
peak <- readPeakFile(files[[6]]) # We pick the 6th file in files (i.e., sample "1740_230_66").

# covplot(...) from ChIPseeker creates a coverage plot of peaks across each chromosome.
covplot(peak, weightCol="V5") # weightCol="V5" indicates the column name (or index) to use as the “weight” (often the peak score or signal).

(c) Checking and Standardizing Seqlevels

#### Checking seqlevels in peaks vs. TxDb
### Before running the annotation, check that the sequence levels match:

# Sequence levels in the peak files
# If peak files have “1, 2, 3” while TxDb has “chr1, chr2, chr3,” we need to harmonize.
peak_seqlevels <- seqlevels(readPeakFile(files[[1]])) # seqlevels(...): Returns the chromosome names (e.g., chr1, chr2, ...).
print(peak_seqlevels)

# Sequence levels in the TxDb
txdb_seqlevels <- seqlevels(txdb)
print(txdb_seqlevels)


# Standardize the sequence levels:
# Use seqlevelsStyle() to update the levels.
library(GenomeInfoDb)

# Change the sequence level style in the peak files
# By setting seqlevelsStyle(peak) <- seqlevelsStyle(txdb), we ensure the peak object’s chromosome style matches the TxDb’s style ("chr1", "chr2", ...).
seqlevelsStyle(peak) <- seqlevelsStyle(txdb) # GenomeInfoDb::seqlevelsStyle(...): A function that sets or gets the chromosome naming style (e.g., “UCSC,” “NCBI,” “ENSEMBL”).

# Check again
print(seqlevels(peak))

(d) Annotating Peaks

# Now perform the peak annotation
# lapply(files, ...): Iterates over each peak file.
# peakAnnoList is a list of csAnno objects (ChIPseeker’s annotation format), one per sample.
peakAnnoList <- lapply(files, function(file) { 
  peak <- readPeakFile(file) # readPeakFile(...): Reads the .narrowPeak file into a GRanges.
  
  seqlevelsStyle(peak) <- seqlevelsStyle(txdb) # seqlevelsStyle(peak) <- seqlevelsStyle(txdb): Standardize chromosome names.
  
  annotatePeak(peak,  # # annotatePeak(...): The core ChIPseeker function that assigns genomic features (promoter, exon, intron, UTR, intergenic), calculates distance to TSS, and can link peak to gene IDs.
               TxDb=txdb, # # TxDb=txdb: uses the loaded TxDb for annotation.
               tssRegion=c(-3000, 3000), # # tssRegion=c(-3000, 3000): define promoter region as 3 kb upstream/downstream of TSS.
               verbose=TRUE) # # verbose=TRUE: prints progress logs.
 
})

(e) Visualize Annotations

### 1) Generating Pie Charts of Annotation ---------

# for (i in names(peakAnnoList)) {...}: Iterates over each named sample in peakAnnoList.
for (i in names(peakAnnoList)) {
  pdf(file = paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/plotAnnoPie_", 
                    i, ".pdf"),   # Output file name
      width = 10, height = 10)
  
  # plotAnnoPie(...): ChIPseeker function that creates a pie chart showing the distribution of peaks across genomic features (promoter, exon, intron, intergenic, etc.).
  plotAnnoPie(peakAnnoList[[i]], 
              main = i, # main = i labels the plot by sample name.
              adj = 0.9, # adj and line tweak text placement.
              line=-4.8)
  
  dev.off()
}

### 2) Generating Upset Plots --------
for (i in names(peakAnnoList)){
  upsetplot(peakAnnoList[[i]], # upsetplot(...): Another ChIPseeker function that shows how peaks overlap with different genomic annotation categories.
            vennpie=TRUE) +  # vennpie=TRUE: Also draws small pie charts in the upset plot intersections.
    ggtitle(i) +  
    theme(plot.title = element_text(hjust = 0.5))
  
  ggsave(
    paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/upsetplot_", i, ".pdf"),
    plot = last_plot(),
    device = NULL,
    path = NULL,
    scale = 1,
    width = 10,
    height = 10,
    units = "in",
    dpi = 300
  )
}


### 3) Plotting Annotation Bar Charts -----------

# The bar chart typically shows the fraction/percentage of peaks in promoters, 5’UTR, exons, etc., for each sample.

pdf(file = "/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/plotAnnoBar.pdf",
    width = 10, height = 10)
plotAnnoBar(peakAnnoList, title="Percentage of peaks landing regions") # plotAnnoBar(...) takes a list of annotated peaks (peakAnnoList) and draws a bar plot comparing annotation categories across multiple samples.
dev.off()


### 4) Plotting Distance to TSS ----------

# This helps visualize whether peaks are enriched near transcription start sites.

pdf(file = "/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/plotDistToTSS.pdf",
    width = 10, height = 10)
plotDistToTSS(peakAnnoList,
              title="Distribution of transcription factor-binding to TSS") # plotDistToTSS(...): Plots how peaks are distributed relative to the TSS (often a histogram or cumulative distribution).
dev.off()

(f) (Optional) Merged Peaks

### Reading Merged Peak Files

# 1) We list files matching *merged.bed (presumably merged peak files for two conditions).
# 2) Convert that character vector into a list.
# 3) Assign names "KO" and "Control" for easier referencing (like files[["KO"]]).

### MERGED PLOTS

peaks <- dir("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Peaks_calling_MACS2", 
             pattern = "*merged.bed",
             full.names = TRUE)

files <- peaks %>% as.list()

names(files) <- c("KO", "Control")


library(ChIPseeker)

# We read one of the merged peaks.
peak <- readPeakFile(files[[2]])
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene # Again, define txdb and ensure its style is “UCSC.”
seqlevelsStyle(txdb) <- "UCSC"

# Standardize chromosome naming for the peak
library(GenomeInfoDb)
seqlevelsStyle(peak) <- seqlevelsStyle(txdb) # As before, we make sure peak and txdb share naming conventions.


# Annotate and Plot Merged Peaks
# IMPORTANT: We do the same annotation approach, but for each merged peak file (KO, Control).
peakAnnoList <- lapply(files, function(file) {
  peak <- readPeakFile(file)
  seqlevelsStyle(peak) <- seqlevelsStyle(txdb)
  annotatePeak(peak, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=T)
})

# Pie chart
for (i in names(peakAnnoList)){
  pdf(file = paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/plotAnnoPie_", i, ".pdf"),
      width = 10, height = 10)
  plotAnnoPie(peakAnnoList[[i]], main = i, adj = 0.9, line=-4.8) 
  dev.off()
}

# Upset plots for merged peaks, with the same approach as before.
for (i in names(peakAnnoList)){
  upsetplot(peakAnnoList[[i]], vennpie=TRUE) + ggtitle(i) + theme(plot.title = element_text(hjust = 0.5))
  ggsave(
    paste0("/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/upsetplot_", i, ".pdf"),
    plot = last_plot(),
    device = NULL,
    path = NULL,
    scale = 1,
    width = 10,
    height = 10,
    units = "in",
    dpi = 300
  )
}


# A bar chart comparing annotation distribution for the two merged peak sets.

pdf(file = "/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/plotAnnoBa_merged.pdf",
    width = 10, height = 10)
plotAnnoBar(peakAnnoList, title="Percentage of peaks landing regions")
dev.off()

# Plot the distance to TSS distribution for the two merged peak sets

pdf(file = "/home/omar/My_lab/ATAC_elena/ATAC_elena_round_2/OUTPUT/Diff_Acc_Peaks_Analysis_Results/plotDistToTSS_merged.pdf",
    width = 10, height = 10)
plotDistToTSS(peakAnnoList,
              title="Distribution of transcription factor-binding to TSS")
dev.off()