title: “DNA FASTQ to BigWig Pipeline Report”

subtitle: “DNAfastqBigWig_human_main_v5_31aug2025.sh Execution Summary”

author: “Automated Pipeline Report”

date: “2026-01-12”

output:

html_document:

toc: true

toc_depth: 3

toc_float: true

theme: bootstrap

highlight: tango

code_folding: hide

df_print: paged

pdf_document:

toc: true

toc_depth: 3

number_sections: true

highlight: tango

df_print: kable

fig_caption: true

keep_tex: false

latex_engine: pdflatex

includes:

  in_header:

    - \usepackage{booktabs}

    - \usepackage{longtable}

    - \usepackage{array}

    - \usepackage{multirow}

    - \usepackage{wrapfig}

    - \usepackage{float}

    - \usepackage{colortbl}

    - \usepackage{pdflscape}

    - \usepackage{tabu}

    - \usepackage{threeparttable}

    - \usepackage{threeparttablex}

    - \usepackage[normalem]{ulem}

    - \usepackage{makecell}

    - \usepackage{xcolor}

geometry: "margin=1in"

fontsize: 11pt

linestretch: 1.2

Executive Summary

This report documents the execution of the DNA FASTQ to BigWig processing pipeline (DNAfastqBigWig_human_main_v5_31aug2025.sh) which performs comprehensive analysis of paired-end DNA sequencing data from raw FASTQ files to normalized genome coverage tracks.

Pipeline Overview

The pipeline executes the following major steps:

  1. Quality Control (Initial) - FastQC analysis of raw FASTQ files

  2. Adapter Trimming - TrimGalore processing for quality and adapter removal

  3. Quality Control (Post-trimming) - FastQC analysis of trimmed files

  4. Genome Alignment - Bowtie2 mapping to hg38 reference genome

  5. Duplicate Removal - Picard deduplication of aligned reads

  6. Coverage Generation - Creation of normalized bedGraph and bigWig files

  7. Quality Reporting - MultiQC summaries at each major step

System Information

Computing Environment

# System information

system_info <- data.frame(

  Parameter = c("Hostname", "User", "Operating System", "R Version", "Report Generated", "Pipeline Script"),

  Value = c(

    "biolserv",
    "micgdu",

    system("uname -a", intern = TRUE),

    R.version.string,

    "2026-01-12 23:54:22",

    "DNAfastqBigWig_human_main_v5_31aug2025.sh"

  ),

  stringsAsFactors = FALSE

)

# Format table based on output

if(output_format == "latex") {

  kable(system_info, caption = "System and Environment Information",

        booktabs = TRUE, longtable = FALSE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "hold_position"))

} else {

  kable(system_info, caption = "System and Environment Information")

}
System and Environment Information
Parameter Value
Hostname biolserv
User micgdu
Operating System Linux biolserv 6.8.0-90-generic #91-Ubuntu SMP PREEMPT_DYNAMIC Tue Nov 18 14:14:30 UTC 2025 x86_64 x86_64 x86_64 GNU/Linux
R Version R version 4.3.2 (2023-10-31)
Report Generated 2026-01-12 23:54:22
Pipeline Script DNAfastqBigWig_human_main_v5_31aug2025.sh

Hardware Resources

# Get hardware information

cpu_info <- system("nproc", intern = TRUE)

memory_info <- system("free -h | grep '^Mem:' | awk '{print $2}'", intern = TRUE)

disk_info <- system("df -h . | tail -1 | awk '{print $2\" (\"$4\" available)\"}'", intern = TRUE)

hardware_info <- data.frame(

  Resource = c("CPU Cores", "Total Memory", "Disk Space (Available)"),

  Specification = c(cpu_info, memory_info, disk_info),

  stringsAsFactors = FALSE

)

# Format table based on output

if(output_format == "latex") {

  kable(hardware_info, caption = "Hardware Resources",

        booktabs = TRUE, longtable = FALSE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "hold_position"))

} else {

  kable(hardware_info, caption = "Hardware Resources")

}
Hardware Resources
Resource Specification
CPU Cores 144
Total Memory 503Gi
Disk Space (Available) 16T (811G available)

Software Versions

# Check software versions

get_version <- function(cmd) {

  tryCatch({

    result <- system(cmd, intern = TRUE)[1]

    if(is.na(result) || result == "") return("Not available")

    return(result)

  }, error = function(e) "Not available")

}

software_versions <- data.frame(

  Software = c("FastQC", "TrimGalore", "Bowtie2", "Samtools", "Picard", "Bedtools", "MultiQC"),

  Version_Command = c(

    "fastqc --version 2>&1 | head -1",

    "trim_galore --version 2>&1 | head -1",

    "bowtie2 --version 2>&1 | head -1",

    "samtools --version 2>&1 | head -1",

    "java -jar /home/micgdu/software/picard.jar MarkDuplicates --version 2>&1 | head -1",

    "bedtools --version 2>&1 | head -1",

    "multiqc --version 2>&1 | head -1"

  ),

  stringsAsFactors = FALSE

)

software_versions$Version <- sapply(software_versions$Version_Command, get_version)

software_versions$Version_Command <- NULL

# Format table based on output

if(output_format == "latex") {

  kable(software_versions, caption = "Software Versions Used",

        booktabs = TRUE, longtable = FALSE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "hold_position"))

} else {

  kable(software_versions, caption = "Software Versions Used")

}
Software Versions Used
Software Version
FastQC FastQC v0.12.1
TrimGalore Not available
Bowtie2 /home/micgdu/software/bowtie2/bowtie2-2.5.0-linux-x86_64/bowtie2-align-s version 2.5.0
Samtools samtools 1.19.2
Picard Version:3.3.0
Bedtools bedtools v2.31.1
MultiQC multiqc, version 1.30

Pipeline Execution Analysis

Script-Level Resource Usage and Timing

# Define the scripts used in the pipeline with their resource profiles

pipeline_scripts <- data.frame(

  Step = c(

    "1. Initial FastQC",

    "2. TrimGalore",

    "3. Post-trim FastQC",

    "4. Bowtie2 Alignment",

    "5. Picard Deduplication",

    "6. Coverage Generation"

  ),

  Script_Name = c(

    "fastqc_batch_v1_30aug2025.sh",

    "trimgalore_batch_final_v2_30aug2025.sh",

    "fastqc_batch_v1_30aug2025.sh",

    "bowtie2_human_batch_v1_31aug.sh",

    "picard_deduplication_batch_31aug2025_v8.sh",

    "genomecoverage_batch_v1_31aug2025.sh"

  ),

  Lower_Level_Script = c(

    "Built-in FastQC",

    "Built-in TrimGalore",

    "Built-in FastQC",

    "bowtie2_dovetail_pairedEnd_Hsapiens_31Aug25.sh",

    "picard_deduplication_28aug2025.sh",

    "genomeCoverage_DNA_human_26aug2025.sh"

  ),

  Concurrent_Jobs = c("2x parameter", "1x parameter", "2x parameter", "Parameter", "Parameter", "Parameter"),

  Threads_Per_Job = c("10 (fixed)", "8 (fixed)", "10 (fixed)", "15 (Bowtie2)", "1 (Picard)", "10 (samtools)"),

  Memory_Per_Job = c("~2GB", "~4GB", "~2GB", "~8GB", "128GB (Java heap)", "~16GB"),

  Primary_Tool = c("FastQC", "TrimGalore + cutadapt", "FastQC", "Bowtie2 + samtools", "Picard MarkDuplicates", "bedtools + samtools"),

  stringsAsFactors = FALSE

)

# Format table based on output

if(output_format == "latex") {

  kable(pipeline_scripts, caption = "Pipeline Scripts and Resource Requirements",

        booktabs = TRUE, longtable = TRUE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "repeat_header"),

                            font_size = 9) %>%

    kableExtra::landscape()

} else {

  kable(pipeline_scripts, caption = "Pipeline Scripts and Resource Requirements")

}
Pipeline Scripts and Resource Requirements
Step Script_Name Lower_Level_Script Concurrent_Jobs Threads_Per_Job Memory_Per_Job Primary_Tool
  1. Initial FastQC
fastqc_batch_v1_30aug2025.sh Built-in FastQC 2x parameter 10 (fixed) ~2GB FastQC
  1. TrimGalore
trimgalore_batch_final_v2_30aug2025.sh Built-in TrimGalore 1x parameter 8 (fixed) ~4GB TrimGalore + cutadapt
  1. Post-trim FastQC
fastqc_batch_v1_30aug2025.sh Built-in FastQC 2x parameter 10 (fixed) ~2GB FastQC
  1. Bowtie2 Alignment
bowtie2_human_batch_v1_31aug.sh bowtie2_dovetail_pairedEnd_Hsapiens_31Aug25.sh Parameter 15 (Bowtie2) ~8GB Bowtie2 + samtools
  1. Picard Deduplication
picard_deduplication_batch_31aug2025_v8.sh picard_deduplication_28aug2025.sh Parameter 1 (Picard) 128GB (Java heap) Picard MarkDuplicates
  1. Coverage Generation
genomecoverage_batch_v1_31aug2025.sh genomeCoverage_DNA_human_26aug2025.sh Parameter 10 (samtools) ~16GB bedtools + samtools

Theoretical Resource Consumption

# Calculate theoretical resource usage based on script parameters

calculate_resources <- function(max_jobs_param = 8) {

  resource_calc <- data.frame(

    Step = pipeline_scripts$Step,

    Jobs = c(

      2 * max_jobs_param, # FastQC: 2x parameter

      1 * max_jobs_param, # TrimGalore: 1x parameter

      2 * max_jobs_param, # FastQC: 2x parameter

      1 * max_jobs_param, # Bowtie2: 1x parameter

      1 * max_jobs_param, # Picard: 1x parameter

      1 * max_jobs_param  # Coverage: 1x parameter

    ),

    Threads_Per_Job = c(10, 8, 10, 15, 1, 10),

    Memory_GB_Per_Job = c(2, 4, 2, 8, 128, 16),

    stringsAsFactors = FALSE

  )

  resource_calc$Total_Threads <- resource_calc$Jobs * resource_calc$Threads_Per_Job

  resource_calc$Total_Memory_GB <- resource_calc$Jobs * resource_calc$Memory_GB_Per_Job

  return(resource_calc)

}

# Calculate for default parameter of 8

resource_usage <- calculate_resources(8)

# Format table based on output

if(output_format == "latex") {

  kable(resource_usage, caption = "Theoretical Resource Usage (max\\_jobs parameter = 8)",

        booktabs = TRUE, longtable = FALSE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "hold_position"),

                            font_size = 10)

} else {

  kable(resource_usage, caption = "Theoretical Resource Usage (max_jobs parameter = 8)")

}
Theoretical Resource Usage (max_jobs parameter = 8)
Step Jobs Threads_Per_Job Memory_GB_Per_Job Total_Threads Total_Memory_GB
  1. Initial FastQC
16 10 2 160 32
  1. TrimGalore
8 8 4 64 32
  1. Post-trim FastQC
16 10 2 160 32
  1. Bowtie2 Alignment
8 15 8 120 64
  1. Picard Deduplication
8 1 128 8 1024
  1. Coverage Generation
8 10 16 80 128
# Summary statistics

peak_threads <- max(resource_usage$Total_Threads)

peak_memory <- max(resource_usage$Total_Memory_GB)

total_thread_hours <- sum(resource_usage$Total_Threads) * 2 # Assuming 2 hours average per step

Peak Resource Requirements:

  • Maximum concurrent threads: 160

  • Maximum memory usage: 1024 GB

  • Total computational thread-hours: ~1184 (estimated for medium dataset)

File System Analysis

Directory Structure

# List the directory structure created by the pipeline

directories <- c(

  "fastQC/",

  "fastQC/fastQC_unTrimmed/",

  "fastQC/fastQC_trimmed/",

  "multiQC/",

  "multiQC/multiQC_unTrimmed/",

  "multiQC/multiQC_trimmed/",

  "multiQC/multiQC_alignments/",

  "multiQC/multiQC_deduplication/",

  "trimmedFastq/",

  "bams/",

  "dedupBams/",

  "bedGraph/",

  "NormBedGraph/",

  "bigwig/"

)

dir_info <- data.frame(

  Directory = directories,

  Purpose = c(

    "FastQC reports root directory",

    "FastQC reports for raw FASTQ files",

    "FastQC reports for trimmed FASTQ files",

    "MultiQC reports root directory",

    "MultiQC summary for raw FASTQ analysis",

    "MultiQC summary for trimmed FASTQ analysis",

    "MultiQC summary for alignment statistics",

    "MultiQC summary for deduplication statistics",

    "Trimmed FASTQ files (*_val_*.fq.gz)",

    "Aligned BAM files (*_sorted_stChr.bam)",

    "Deduplicated BAM files (*_dedup.bam)",

    "Raw bedGraph coverage files",

    "Normalized bedGraph coverage files",

    "BigWig coverage files (*_Snorm.bw)"

  ),

  stringsAsFactors = FALSE

)

# Format table based on output

if(output_format == "latex") {

  kable(dir_info, caption = "Pipeline Output Directory Structure",

        booktabs = TRUE, longtable = TRUE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "repeat_header"),

                            font_size = 9)

} else {

  kable(dir_info, caption = "Pipeline Output Directory Structure")

}
Pipeline Output Directory Structure
Directory Purpose
fastQC/ FastQC reports root directory
fastQC/fastQC_unTrimmed/ FastQC reports for raw FASTQ files
fastQC/fastQC_trimmed/ FastQC reports for trimmed FASTQ files
multiQC/ MultiQC reports root directory
multiQC/multiQC_unTrimmed/ MultiQC summary for raw FASTQ analysis
multiQC/multiQC_trimmed/ MultiQC summary for trimmed FASTQ analysis
multiQC/multiQC_alignments/ MultiQC summary for alignment statistics
multiQC/multiQC_deduplication/ MultiQC summary for deduplication statistics
trimmedFastq/ Trimmed FASTQ files (val.fq.gz)
bams/ Aligned BAM files (*_sorted_stChr.bam)
dedupBams/ Deduplicated BAM files (*_dedup.bam)
bedGraph/ Raw bedGraph coverage files
NormBedGraph/ Normalized bedGraph coverage files
bigwig/ BigWig coverage files (*_Snorm.bw)

File Size Analysis

# Function to get file sizes and counts

get_directory_info <- function(dir_path) {

  if (!dir.exists(dir_path)) {

    return(data.frame(

      Directory = basename(dir_path),

      Files = 0,

      Total_Size_GB = 0,

      Avg_Size_MB = 0,

      stringsAsFactors = FALSE

    ))

  }

  files <- list.files(dir_path, recursive = TRUE, full.names = TRUE)

  if (length(files) == 0) {

    return(data.frame(

      Directory = basename(dir_path),

      Files = 0,

      Total_Size_GB = 0,

      Avg_Size_MB = 0,

      stringsAsFactors = FALSE

    ))

  }

  file_info <- file.info(files)

  total_size_bytes <- sum(file_info$size, na.rm = TRUE)

  total_size_gb <- total_size_bytes / (1024^3)

  avg_size_mb <- (total_size_bytes / length(files)) / (1024^2)

  data.frame(

    Directory = basename(dir_path),

    Files = length(files),

    Total_Size_GB = round(total_size_gb, 2),

    Avg_Size_MB = round(avg_size_mb, 2),

    stringsAsFactors = FALSE

  )

}

# Analyze each directory

dir_analysis <- do.call(rbind, lapply(directories, function(d) {

  get_directory_info(file.path(getwd(), d))

}))

# Add totals row

totals <- data.frame(

  Directory = "**TOTAL**",

  Files = sum(dir_analysis$Files),

  Total_Size_GB = sum(dir_analysis$Total_Size_GB),

  Avg_Size_MB = round(mean(dir_analysis$Avg_Size_MB), 2),

  stringsAsFactors = FALSE

)

dir_analysis_with_totals <- rbind(dir_analysis, totals)

# Format table based on output

if(output_format == "latex") {

  kable(dir_analysis_with_totals, caption = "File System Usage Analysis",

        booktabs = TRUE, longtable = FALSE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "hold_position"))

} else {

  kable(dir_analysis_with_totals, caption = "File System Usage Analysis")

}
File System Usage Analysis
Directory Files Total_Size_GB Avg_Size_MB
fastQC 98 0.03 0.28
fastQC_unTrimmed 49 0.01 0.28
fastQC_trimmed 49 0.01 0.28
multiQC 145 0.03 0.21
multiQC_unTrimmed 53 0.01 0.19
multiQC_trimmed 57 0.01 0.18
multiQC_alignments 16 0.01 0.34
multiQC_deduplication 19 0.01 0.28
trimmedFastq 38 18.14 488.87
bams 37 23.00 636.61
dedupBams 71 14.97 215.84
bedGraph 0 0.00 0.00
NormBedGraph 0 0.00 0.00
bigwig 6 1.35 230.80
TOTAL 638 57.58 112.44

Log and Report Analysis

Batch Processing Logs

# Function to find and analyze log directories

find_log_dirs <- function(pattern) {

  log_dirs <- list.dirs(".", recursive = TRUE, full.names = FALSE)

  log_dirs[grepl(pattern, log_dirs)]

}

# Find all batch processing log directories

log_types <- c(

  "fastqc_batch_logs",

  "trimgalore_batch_logs",

  "bowtie2_batch_logs",

  "picard_batch_logs",

  "genomecov_batch_logs"

)

log_summary <- data.frame(

  Log_Type = c(

    "FastQC Batch Logs",

    "TrimGalore Batch Logs",

    "Bowtie2 Batch Logs",

    "Picard Batch Logs",

    "GenomeCov Batch Logs"

  ),

  Pattern = paste0(log_types, "_*"),

  Purpose = c(

    "Individual FastQC job logs, timing, and errors",

    "Individual TrimGalore job logs and statistics",

    "Individual Bowtie2 alignment logs and metrics",

    "Individual Picard deduplication logs and metrics",

    "Individual genome coverage generation logs"

  ),

  Contains = c(

    "Job logs, error logs, main batch log, PID file",

    "Job logs, error logs, main batch log, PID file",

    "Job logs, error logs, main batch log, PID file",

    "Job logs, error logs, main batch log, PID file",

    "Job logs, error logs, main batch log, PID file"

  ),

  stringsAsFactors = FALSE

)

# Format table based on output

if(output_format == "latex") {

  kable(log_summary, caption = "Batch Processing Log Structure",

        booktabs = TRUE, longtable = TRUE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "repeat_header"),

                            font_size = 8)

} else {

  kable(log_summary, caption = "Batch Processing Log Structure")

}
Batch Processing Log Structure
Log_Type Pattern Purpose Contains
FastQC Batch Logs fastqc_batch_logs_* Individual FastQC job logs, timing, and errors Job logs, error logs, main batch log, PID file
TrimGalore Batch Logs trimgalore_batch_logs_* Individual TrimGalore job logs and statistics Job logs, error logs, main batch log, PID file
Bowtie2 Batch Logs bowtie2_batch_logs_* Individual Bowtie2 alignment logs and metrics Job logs, error logs, main batch log, PID file
Picard Batch Logs picard_batch_logs_* Individual Picard deduplication logs and metrics Job logs, error logs, main batch log, PID file
GenomeCov Batch Logs genomecov_batch_logs_* Individual genome coverage generation logs Job logs, error logs, main batch log, PID file
# Try to find actual log directories

actual_logs <- character(0)

for (pattern in log_types) {

  found <- find_log_dirs(pattern)

  if (length(found) > 0) {

    actual_logs <- c(actual_logs, found)

  }

}

Found Log Directories: - fastQC/fastQC_trimmed/fastqc_batch_logs_20260112_211938 - fastQC/fastQC_trimmed/fastqc_batch_logs_20260112_211938/individual_jobs - fastQC/fastQC_unTrimmed/fastqc_batch_logs_20260112_210944 - fastQC/fastQC_unTrimmed/fastqc_batch_logs_20260112_210944/individual_jobs - trimmedFastq/trimgalore_batch_logs_20260112_211408 - trimmedFastq/trimgalore_batch_logs_20260112_211408/individual_jobs - bams/bowtie2_batch_logs_20260112_212340 - bams/bowtie2_batch_logs_20260112_212340/individual_jobs - dedupBams/picard_batch_logs_20260112_233428 - dedupBams/picard_batch_logs_20260112_233428/individual_jobs - dedupBams/genomecov_batch_logs_20260112_231003 - dedupBams/genomecov_batch_logs_20260112_231003/individual_jobs - dedupBams/genomecov_batch_logs_20260112_234318 - dedupBams/genomecov_batch_logs_20260112_234318/individual_jobs

Performance Metrics and Summary

Pipeline Efficiency Features

efficiency_features <- data.frame(

  Feature = c(

    "Continuous Job Replacement",

    "Signal Immunity",

    "Comprehensive Logging",

    "Smart Skip Logic",

    "PID Tracking",

    "Progress Monitoring",

    "Resource Optimization"

  ),

  Description = c(

    "New jobs start immediately when others finish, maximizing CPU utilization",

    "Scripts immune to SIGHUP, SIGINT, SIGTERM - safe for remote execution",

    "Detailed logs for each job, main process, and error tracking",

    "Automatically skips files that have already been processed",

    "Tracks process IDs for job management and cleanup",

    "Regular status updates every 5 minutes during execution",

    "Parameterized job counts and thread usage for different system capacities"

  ),

  Benefit = c(

    "Reduced total processing time",

    "Prevents data loss from disconnections",

    "Easy troubleshooting and monitoring",

    "Enables pipeline restart/resume",

    "Clean termination and resource cleanup",

    "Real-time execution monitoring",

    "Optimal resource utilization"

  ),

  stringsAsFactors = FALSE

)

# Format table based on output

if(output_format == "latex") {

  kable(efficiency_features, caption = "Pipeline Efficiency and Reliability Features",

        booktabs = TRUE, longtable = TRUE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "repeat_header"),

                            font_size = 8)

} else {

  kable(efficiency_features, caption = "Pipeline Efficiency and Reliability Features")

}
Pipeline Efficiency and Reliability Features
Feature Description Benefit
Continuous Job Replacement New jobs start immediately when others finish, maximizing CPU utilization Reduced total processing time
Signal Immunity Scripts immune to SIGHUP, SIGINT, SIGTERM - safe for remote execution Prevents data loss from disconnections
Comprehensive Logging Detailed logs for each job, main process, and error tracking Easy troubleshooting and monitoring
Smart Skip Logic Automatically skips files that have already been processed Enables pipeline restart/resume
PID Tracking Tracks process IDs for job management and cleanup Clean termination and resource cleanup
Progress Monitoring Regular status updates every 5 minutes during execution Real-time execution monitoring
Resource Optimization Parameterized job counts and thread usage for different system capacities Optimal resource utilization

Final Resource Requirements Summary

# Create final summary table

final_summary <- data.frame(

  Metric = c(

    "Total Pipeline Steps",

    "Core Processing Scripts",

    "Maximum Concurrent Jobs (default)",

    "Peak Thread Usage (default)",

    "Peak Memory Usage (default)",

    "Primary Output File Types"

  ),

  Value = c(

    "7 major steps",

    "8 specialized scripts",

    "16 jobs (2x FastQC steps)",

    paste0(peak_threads, " threads (", peak_threads/10, " FastQC jobs × 10 threads)"),

    paste0(peak_memory, " GB (", peak_memory/128, " Picard jobs × 128GB)"),

    "BigWig, BAM, FastQC reports, MultiQC summaries"

  ),

  stringsAsFactors = FALSE

)

# Format table based on output

if(output_format == "latex") {

  kable(final_summary, caption = "Pipeline Resource Requirements Summary",

        booktabs = TRUE, longtable = FALSE) %>%

    kableExtra::kable_styling(latex_options = c("striped", "hold_position"))

} else {

  kable(final_summary, caption = "Pipeline Resource Requirements Summary")

}
Pipeline Resource Requirements Summary
Metric Value
Total Pipeline Steps 7 major steps
Core Processing Scripts 8 specialized scripts
Maximum Concurrent Jobs (default) 16 jobs (2x FastQC steps)
Peak Thread Usage (default) 160 threads (16 FastQC jobs × 10 threads)
Peak Memory Usage (default) 1024 GB (8 Picard jobs × 128GB)
Primary Output File Types BigWig, BAM, FastQC reports, MultiQC summaries

Complete File Inventory

This section provides a comprehensive inventory of all files generated during the pipeline execution, organized by directory and file type.

# Function to format file sizes

format_size <- function(bytes) {

  if (is.na(bytes) || bytes == 0) return("0 B")

  units <- c("B", "KB", "MB", "GB", "TB")

  unit_index <- min(floor(log(bytes, 1024)) + 1, length(units))

  size_value <- bytes / (1024^(unit_index - 1))

  return(paste0(round(size_value, 2), " ", units[unit_index]))

}

# Function to get detailed file information with sizes (non-recursive for main files only)

get_detailed_file_info <- function(dir_path, base_path = ".") {

  full_path <- file.path(base_path, dir_path)

  

  if (!dir.exists(full_path)) {

    return(list(

      path = dir_path,

      folder_size = "0 B",

      files = data.frame(

        Filename = character(0),

        Size = character(0),

        Type = character(0),

        stringsAsFactors = FALSE

      )

    ))

  }

  # Get folder size (including subdirectories)

  folder_size_cmd <- paste0("du -sb '", full_path, "' 2>/dev/null | cut -f1")

  folder_size_result <- system(folder_size_cmd, intern = TRUE)

  folder_size_bytes <- if(length(folder_size_result) > 0) as.numeric(folder_size_result) else 0

  folder_size <- format_size(folder_size_bytes)

  

  # Get files in directory (NON-recursive to avoid log contamination)

  files <- list.files(full_path, full.names = TRUE, recursive = FALSE)

  files <- files[!dir.exists(files)] # Only files, not subdirectories

  

  if (length(files) == 0) {

    return(list(

      path = dir_path,

      folder_size = folder_size,

      files = data.frame(

        Filename = character(0),

        Size = character(0),

        Type = character(0),

        stringsAsFactors = FALSE

      )

    ))

  }

  # Get file info

  file_info <- file.info(files)

  filenames <- basename(files)

  sizes_bytes <- file_info$size

  sizes_formatted <- sapply(sizes_bytes, format_size)

  

  # Determine file types based on extensions

  get_file_type <- function(filename) {

    ext <- tools::file_ext(tolower(filename))

    if (ext == "") return("No extension")

    switch(ext,

      "fq.gz" = "FASTQ (compressed)",

      "fastq.gz" = "FASTQ (compressed)",

      "bam" = "BAM alignment",

      "bai" = "BAM index",

      "sam" = "SAM alignment",

      "bw" = "BigWig coverage",

      "bigwig" = "BigWig coverage",

      "bedgraph" = "BedGraph coverage",

      "gz" = "Compressed file",

      "html" = "HTML report",

      "zip" = "ZIP archive",

      "txt" = "Text file",

      "log" = "Log file",

      "pid" = "Process ID file",

      "json" = "JSON data",

      "csv" = "CSV data",

      "tsv" = "TSV data",

      "sizes" = "Chromosome sizes",

      paste0(toupper(ext), " file")

    )

  }

  file_types <- sapply(filenames, get_file_type)

  

  # Create data frame

  files_df <- data.frame(

    Filename = filenames,

    Size = sizes_formatted,

    Type = file_types,

    stringsAsFactors = FALSE

  )

  

  # Sort by size (convert back to numeric for sorting)

  size_numeric <- sapply(sizes_bytes, function(x) if(is.na(x)) 0 else x)

  files_df <- files_df[order(size_numeric, decreasing = TRUE), ]

  

  return(list(

    path = dir_path,

    folder_size = folder_size,

    files = files_df

  ))

}

# Define main pipeline directories to analyze (exclude log directories)

main_directories <- c(

  "fastQC/fastQC_unTrimmed",

  "fastQC/fastQC_trimmed", 

  "multiQC/multiQC_unTrimmed",

  "multiQC/multiQC_trimmed",

  "multiQC/multiQC_alignments",

  "multiQC/multiQC_deduplication",

  "trimmedFastq",

  "bams",

  "dedupBams",

  "bigwig"

)

# Get file information for all directories

all_file_info <- lapply(main_directories, get_detailed_file_info)

names(all_file_info) <- main_directories

# Generate output for each directory

for (i in seq_along(all_file_info)) {

  dir_name <- names(all_file_info)[i]

  dir_info <- all_file_info[[i]]

  

  cat("\n## Directory:", dir_name, "\n\n")

  cat("**Folder Path:** `", file.path(getwd(), dir_name), "`  \n")

  cat("**Folder Size:** ", dir_info$folder_size, "  \n\n")

  

  if (nrow(dir_info$files) > 0) {

    if(output_format == "latex") {

      print(kable(dir_info$files, caption = paste("Files in", dir_name),

                  booktabs = TRUE, longtable = FALSE) %>%

        kableExtra::kable_styling(latex_options = c("striped", "hold_position"),

                                font_size = 8))

    } else {

      print(kable(dir_info$files, caption = paste("Files in", dir_name)))

    }

  } else {

    cat("*No files found in this directory.*\n")

  }

  

  cat("\n")

}

Directory: fastQC/fastQC_unTrimmed

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/fastQC/fastQC_unTrimmed
Folder Size: 13.94 MB

No files found in this directory.

Directory: fastQC/fastQC_trimmed

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/fastQC/fastQC_trimmed
Folder Size: 13.67 MB

No files found in this directory.

Directory: multiQC/multiQC_unTrimmed

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/multiQC/multiQC_unTrimmed
Folder Size: 10 MB

Files in multiQC/multiQC_unTrimmed
Filename Size Type
multiQC_unTrimmed.html multiQC_unTrimmed.html 4.92 MB HTML report

Directory: multiQC/multiQC_trimmed

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/multiQC/multiQC_trimmed
Folder Size: 10.02 MB

Files in multiQC/multiQC_trimmed
Filename Size Type
multiQC_trimmed.html multiQC_trimmed.html 4.91 MB HTML report

Directory: multiQC/multiQC_alignments

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/multiQC/multiQC_alignments
Folder Size: 5.46 MB

Files in multiQC/multiQC_alignments
Filename Size Type
multiQC_aligments.html multiQC_aligments.html 4.58 MB HTML report

Directory: multiQC/multiQC_deduplication

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/multiQC/multiQC_deduplication
Folder Size: 5.41 MB

Files in multiQC/multiQC_deduplication
Filename Size Type
multiQC_deduplication.html multiQC_deduplication.html 4.58 MB HTML report

Directory: trimmedFastq

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/trimmedFastq
Folder Size: 18.14 GB

Files in trimmedFastq
Filename Size Type
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R2_001_val_2.fq.gz pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R2_001_val_2.fq.gz 2 GB Compressed file
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz 2 GB Compressed file
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz 1.76 GB Compressed file
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R2_001_val_2.fq.gz pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R2_001_val_2.fq.gz 1.75 GB Compressed file
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R2_001_val_2.fq.gz pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R2_001_val_2.fq.gz 1.52 GB Compressed file
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz 1.52 GB Compressed file
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R2_001_val_2.fq.gz pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R2_001_val_2.fq.gz 1.47 GB Compressed file
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz 1.47 GB Compressed file
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R2_001_val_2.fq.gz pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R2_001_val_2.fq.gz 1.17 GB Compressed file
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz 1.17 GB Compressed file
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R2_001_val_2.fq.gz pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R2_001_val_2.fq.gz 1.16 GB Compressed file
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz 1.16 GB Compressed file
–passthrough –passthrough 2.67 MB No extension
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R2_001.fastq.gz_trimming_report.txt pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R2_001.fastq.gz_trimming_report.txt 6.06 KB Text file
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R2_001.fastq.gz_trimming_report.txt pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R2_001.fastq.gz_trimming_report.txt 6.04 KB Text file
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R2_001.fastq.gz_trimming_report.txt pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R2_001.fastq.gz_trimming_report.txt 6.04 KB Text file
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R2_001.fastq.gz_trimming_report.txt pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R2_001.fastq.gz_trimming_report.txt 6.01 KB Text file
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R2_001.fastq.gz_trimming_report.txt pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R2_001.fastq.gz_trimming_report.txt 5.94 KB Text file
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R2_001.fastq.gz_trimming_report.txt pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R2_001.fastq.gz_trimming_report.txt 5.9 KB Text file
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001.fastq.gz_trimming_report.txt pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001.fastq.gz_trimming_report.txt 5.87 KB Text file
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001.fastq.gz_trimming_report.txt pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001.fastq.gz_trimming_report.txt 5.86 KB Text file
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001.fastq.gz_trimming_report.txt pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001.fastq.gz_trimming_report.txt 5.85 KB Text file
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001.fastq.gz_trimming_report.txt pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001.fastq.gz_trimming_report.txt 5.84 KB Text file
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001.fastq.gz_trimming_report.txt pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001.fastq.gz_trimming_report.txt 5.79 KB Text file
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001.fastq.gz_trimming_report.txt pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001.fastq.gz_trimming_report.txt 5.75 KB Text file

Directory: bams

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/bams
Folder Size: 23 GB

Files in bams
Filename Size Type
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam 2.43 GB BAM alignment
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChr.bam pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChr.bam 2.42 GB BAM alignment
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam 2.22 GB BAM alignment
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChr.bam pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChr.bam 2.21 GB BAM alignment
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam 1.89 GB BAM alignment
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChr.bam pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChr.bam 1.88 GB BAM alignment
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam 1.88 GB BAM alignment
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChr.bam pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChr.bam 1.87 GB BAM alignment
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam 1.58 GB BAM alignment
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChr.bam pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChr.bam 1.57 GB BAM alignment
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam 1.53 GB BAM alignment
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChr.bam pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChr.bam 1.52 GB BAM alignment
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChr.bai pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChr.bai 2.18 MB BAM index
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChr.bai pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChr.bai 2.17 MB BAM index
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChr.bai pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChr.bai 2.16 MB BAM index
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChr.bai pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChr.bai 2.13 MB BAM index
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChr.bai pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChr.bai 2.09 MB BAM index
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChr.bai pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChr.bai 2.04 MB BAM index
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.txt pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.txt 643 B Text file
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.txt pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.txt 643 B Text file
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.txt pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.txt 643 B Text file
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.txt pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.txt 642 B Text file
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.txt pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.txt 639 B Text file
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.txt pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.txt 639 B Text file

Directory: dedupBams

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/dedupBams
Folder Size: 14.97 GB

Files in dedupBams
Filename Size Type
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam 2.49 GB BAM alignment
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam 2.28 GB BAM alignment
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam 1.93 GB BAM alignment
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam 1.92 GB BAM alignment
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam 1.61 GB BAM alignment
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam 1.57 GB BAM alignment
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz 335.96 MB Compressed file
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz 335.88 MB Compressed file
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz 318.04 MB Compressed file
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz 317.44 MB Compressed file
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz 260.33 MB Compressed file
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz 260.26 MB Compressed file
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz 259.5 MB Compressed file
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz 258.75 MB Compressed file
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz 214.01 MB Compressed file
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz 213 MB Compressed file
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bedGraph.gz 206.21 MB Compressed file
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bedGraph.gz 204.9 MB Compressed file
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai 7.11 MB BAM index
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai 7.02 MB BAM index
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai 6.89 MB BAM index
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai 6.82 MB BAM index
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai 6.81 MB BAM index
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bai 6.8 MB BAM index
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai 2.18 MB BAM index
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai 2.18 MB BAM index
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai 2.17 MB BAM index
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai 2.17 MB BAM index
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai 2.15 MB BAM index
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai 2.15 MB BAM index
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai 2.14 MB BAM index
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai 2.13 MB BAM index
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai 2.1 MB BAM index
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai 2.09 MB BAM index
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_SortStChr.bam.bai 2.07 MB BAM index
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam.bai 2.06 MB BAM index
hg38.chrom.sizes hg38.chrom.sizes 11.4 KB Chromosome sizes
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt 2.13 KB Text file
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt 2.13 KB Text file
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt 2.13 KB Text file
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt 2.13 KB Text file
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt 2.13 KB Text file
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup_rep.txt 2.13 KB Text file

Directory: bigwig

Folder Path: /dysk2/groupFolders/micgdu/bioinformatics/run4_Yaarob_hs_LAD/bigwig
Folder Size: 1.35 GB

Files in bigwig
Filename Size Type
pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw pADamID_NTERT_D0_LaminB1_Rep1_S28_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw 290.14 MB BigWig coverage
pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw pADamID_NTERT_D0_LaminAC_rep1_S29_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw 271.22 MB BigWig coverage
pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw pADamID_NTERT_D7_OnlyDam_Rep1_S30_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw 226.98 MB BigWig coverage
pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw pADamID_NTERT_D0_OnlyDam_Rep1_S27_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw 225.05 MB BigWig coverage
pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw pADamID_NTERT_D7_LaminAC_rep1_S32_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw 188.91 MB BigWig coverage
pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw pADamID_NTERT_D7_LaminB1_Rep1_S31_L008_R1_001_val_1.fq.gz.sorted_stChrH.bam_dedup.bam_Snorm.bw 182.49 MB BigWig coverage

File Type Summary

# Aggregate file information by type (only from main pipeline directories)

type_summary <- list()

for (dir_name in names(all_file_info)) {

  dir_info <- all_file_info[[dir_name]]

  if (nrow(dir_info$files) > 0) {

    for (i in 1:nrow(dir_info$files)) {

      file_type <- dir_info$files$Type[i]

      file_size_str <- dir_info$files$Size[i]

      

      # Convert size string back to bytes for aggregation

      size_parts <- strsplit(file_size_str, " ")[[1]]

      size_value <- as.numeric(size_parts[1])

      size_unit <- size_parts[2]

      size_bytes <- switch(size_unit,

        "B" = size_value,

        "KB" = size_value * 1024,

        "MB" = size_value * 1024^2,

        "GB" = size_value * 1024^3,

        "TB" = size_value * 1024^4,

        size_value

      )

      

      if (file_type %in% names(type_summary)) {

        type_summary[[file_type]]$count <- type_summary[[file_type]]$count + 1

        type_summary[[file_type]]$total_bytes <- type_summary[[file_type]]$total_bytes + size_bytes

        type_summary[[file_type]]$locations <- unique(c(type_summary[[file_type]]$locations, dir_name))

      } else {

        type_summary[[file_type]] <- list(

          count = 1,

          total_bytes = size_bytes,

          locations = dir_name

        )

      }

    }

  }

}

# Convert to data frame

if (length(type_summary) > 0) {

  type_summary_df <- data.frame(

    File_Type = names(type_summary),

    Count = sapply(type_summary, function(x) x$count),

    Total_Size = sapply(type_summary, function(x) format_size(x$total_bytes)),

    Locations = sapply(type_summary, function(x) paste(x$locations, collapse = ", ")),

    stringsAsFactors = FALSE

  )

  

  # Sort by total size (descending)

  size_bytes_for_sort <- sapply(type_summary, function(x) x$total_bytes)

  type_summary_df <- type_summary_df[order(size_bytes_for_sort, decreasing = TRUE), ]

  

  if(output_format == "latex") {

    kable(type_summary_df, caption = "File Type Summary - Pipeline Files Only",

          booktabs = TRUE, longtable = TRUE) %>%

      kableExtra::kable_styling(latex_options = c("striped", "repeat_header"),

                              font_size = 8)

  } else {

    kable(type_summary_df, caption = "File Type Summary - Pipeline Files Only")

  }

} else {

  cat("*No files found for type summary.*\n")

}
File Type Summary - Pipeline Files Only
File_Type Count Total_Size Locations
BAM alignment BAM alignment 18 34.8 GB bams, dedupBams
Compressed file Compressed file 24 21.26 GB trimmedFastq, dedupBams
BigWig coverage BigWig coverage 6 1.35 GB bigwig
BAM index BAM index 24 79.81 MB bams, dedupBams
HTML report HTML report 4 18.99 MB multiQC/multiQC_unTrimmed, multiQC/multiQC_trimmed, multiQC/multiQC_alignments, multiQC/multiQC_deduplication
No extension No extension 1 2.67 MB trimmedFastq
Text file Text file 24 87.49 KB trimmedFastq, bams, dedupBams
Chromosome sizes Chromosome sizes 1 11.4 KB dedupBams

Storage Summary

# Calculate total storage used by main pipeline directories only

total_bytes <- 0

directory_sizes <- data.frame(

  Directory = character(0),

  Size_GB = numeric(0),

  Percentage = numeric(0),

  stringsAsFactors = FALSE

)

for (dir_name in names(all_file_info)) {

  dir_info <- all_file_info[[dir_name]]

  

  # Convert folder size to bytes

  size_str <- dir_info$folder_size

  if (size_str != "0 B") {

    size_parts <- strsplit(size_str, " ")[[1]]

    size_value <- as.numeric(size_parts[1])

    size_unit <- size_parts[2]

    size_bytes <- switch(size_unit,

      "B" = size_value,

      "KB" = size_value * 1024,

      "MB" = size_value * 1024^2,

      "GB" = size_value * 1024^3,

      "TB" = size_value * 1024^4,

      size_value

    )

    

    total_bytes <- total_bytes + size_bytes

    directory_sizes <- rbind(directory_sizes, data.frame(

      Directory = dir_name,

      Size_GB = round(size_bytes / (1024^3), 3),

      Percentage = 0, # Will calculate after total

      stringsAsFactors = FALSE

    ))

  }

}

# Calculate percentages

if (nrow(directory_sizes) > 0) {

  directory_sizes$Percentage <- round((directory_sizes$Size_GB * 1024^3 / total_bytes) * 100, 1)

  directory_sizes <- directory_sizes[order(directory_sizes$Size_GB, decreasing = TRUE), ]

  

  if(output_format == "latex") {

    kable(directory_sizes, caption = "Storage Usage by Directory - Pipeline Files Only",

          booktabs = TRUE, longtable = FALSE) %>%

      kableExtra::kable_styling(latex_options = c("striped", "hold_position"))

  } else {

    kable(directory_sizes, caption = "Storage Usage by Directory - Pipeline Files Only")

  }

  

  cat("\n**Total Pipeline Storage Usage:** ", format_size(total_bytes), "\n")

} else {

  cat("*No directory size information available.*\n")

}
## 
## **Total Pipeline Storage Usage:**  57.52 GB

Conclusion

This automated DNA sequencing analysis pipeline provides a comprehensive workflow from raw FASTQ files to normalized genome coverage tracks. Key achievements include:

  • Automated Quality Control: Multi-stage QC with FastQC and MultiQC reporting

  • Robust Processing: Signal-immune batch processing with comprehensive logging

  • Scalable Architecture: Parameterized resource allocation for different system capacities

  • Production Ready: Skip logic enables restart/resume capabilities

Recommendations

  • Minimum system: 32 cores, 256GB RAM, 1TB+ storage

  • Optimal system: 64+ cores, 512GB+ RAM, fast SSD storage

  • Monitor disk space: Intermediate files can be 3-5x input size

  • Use parameter tuning based on available resources

Generated Files and Locations

All output files are organized in the specified output directory with the following structure:

  • Analysis Results: BAM files, coverage tracks, quality reports

  • Logs and Metrics: Comprehensive logging for troubleshooting and monitoring

  • Intermediate Files: Trimmed FASTQ files, alignment statistics, duplication metrics


Report generated automatically by the DNAfastqBigWig pipeline reporting system.

For questions about this pipeline, consult the individual script documentation and log files.