#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --time=8:00:00
#SBATCH --mem=8GB
#SBATCH --job-name=download_reads
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=bl2477@nyu.edu
module load sra-tools/2.10.9
fastq-dump -I SRR7207011
fastq-dump -I SRR7207017
fastq-dump -I SRR7207089
#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --time=8:00:00
#SBATCH --mem=8GB
#SBATCH --job-name=fastp_SRA
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=bl2477@nyu.edu
module load fastp/intel/0.20.1
fastp -i SRR7207011.fastq -o SRR7207011_filtered.fq --length_required=36
fastp -i SRR7207017.fastq -o SRR7207017_filtered.fq --length_required=36
fastp -i SRR7207089.fastq -o SRR7207089_filtered.fq --length_required=36
#length_required specified as 36 as we want a length between 36 and 65, the length_required is set to 36 which is the minimum length requirement.
#
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --time=8:00:00
#SBATCH --mem=8GB
#SBATCH --job-name=download_reads
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=bl2477@nyu.edu
module load bwa/intel/0.7.17
bwa mem -M -t $SLURM_CPUS_PER_TASK /scratch/work/courses/BI7653/hw3.2022/hg38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.normalized.fa SRR7207011_filtered.fq > SRR7207011.sam
bwa mem -M -t $SLURM_CPUS_PER_TASK /scratch/work/courses/BI7653/hw3.2022/hg38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.normalized.fa SRR7207017_filtered.fq > SRR7207017.sam
bwa mem -M -t $SLURM_CPUS_PER_TASK /scratch/work/courses/BI7653/hw3.2022/hg38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.normalized.fa SRR7207089_filtered.fq > SRR7207089.sam
Step 1: Converting each SAM to BAM
#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --time=8:00:00
#SBATCH --mem=8GB
#SBATCH --job-name=SAM_BAM
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=bl2477@nyu.edu
module load samtools/intel/1.14
samtools view -b -h SRR7207011.sam > SRR7207011.bam
samtools view -b -h SRR7207017.sam > SRR7207017.bam
samtools view -b -h SRR7207089.sam > SRR7207089.bam
#Converting SAM to BAM
Step 2: Coordinate sort reads with Picard tools
#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --time=8:00:00
#SBATCH --mem=46GB
#SBATCH --job-name=PICARD
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=bl2477@nyu.edu
module load picard/2.23.8
java -Xmx44g -jar $PICARD_JAR SortSam \
INPUT=SRR7207011.bam \
OUTPUT=SRR7207011_picard.bam \
SORT_ORDER=coordinate \
TMP_DIR="${SLURM_JOBTMP}" \
MAX_RECORDS_IN_RAM=10000000 \
VALIDATION_STRINGENCY=LENIENT
java -Xmx44g -jar $PICARD_JAR SortSam \
INPUT=SRR7207017.bam \
OUTPUT=SRR7207017_picard.bam \
SORT_ORDER=coordinate \
TMP_DIR="${SLURM_JOBTMP}" \
MAX_RECORDS_IN_RAM=10000000 \
VALIDATION_STRINGENCY=LENIENT
java -Xmx44g -jar $PICARD_JAR SortSam \
INPUT=SRR7207089.bam \
OUTPUT=SRR7207089_picard.bam \
SORT_ORDER=coordinate \
TMP_DIR="${SLURM_JOBTMP}" \
MAX_RECORDS_IN_RAM=10000000 \
VALIDATION_STRINGENCY=LENIENT
Step 3: Create a BAM index file for each
#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=2
#SBATCH --time=8:00:00
#SBATCH --mem=46GB
#SBATCH --job-name=BAM_index
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=bl2477@nyu.edu
module load samtools/intel/1.14
samtools index SRR7207011_picard.bam SRR7207011_coordinatesorted.out.index
samtools index SRR7207017_picard.bam SRR7207017_coordinatesorted.out.index
samtools index SRR7207089_picard.bam SRR7207089_coordinatesorted.out.index
Filters out reads with mapping quality less than 20 using the -q option:
#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --time=8:00:00
#SBATCH --mem=8GB
#SBATCH --job-name=BAM_filter
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=bl2477@nyu.edu
module load samtools/intel/1.14
samtools view -b -h -q 20 SRR7207011_picard.bam > SRR7207011_filteredmp.bam
samtools view -b -h -q 20 SRR7207017_picard.bam > SRR7207017_filteredmp.bam
samtools view -b -h -q 20 SRR7207089_picard.bam > SRR7207089_filteredmp.bam
Creating an index file for each of the filtered q20 BAMs
#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=2
#SBATCH --time=8:00:00
#SBATCH --mem=8GB
#SBATCH --job-name=BAM_index
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=bl2477@nyu.edu
module load samtools/intel/1.14
samtools index SRR7207011_filteredmp.bam SRR7207011_coordinatesorted_filterd.out.index
samtools index SRR7207017_filteredmp.bam SRR7207017_coordinatesorted_filterd.out.index
samtools index SRR7207089_filteredmp.bam SRR7207089_coordinatesorted_filterd.out.index
#!/bin/bash
#
#SBATCH --nodes=1
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --time=8:00:00
#SBATCH --mem=8GB
#SBATCH --job-name=MACS2
#SBATCH --mail-type=FAIL,END
#SBATCH --mail-user=bl2477@nyu.edu
module load macs2/intel/2.2.7.1
macs2 callpeak -t SRR7207011_filteredmp.bam -c SRR7207089_filteredmp.bam -f BAM -g hs -n SRR7207011_filteredmp_macs2 -B -q 0.01
macs2 callpeak -t SRR7207017_filteredmp.bam -c SRR7207089_filteredmp.bam -f BAM -g hs -n SRR7207017_filteredmp_macs2 -B -q 0.01
The main purpose of conducting a transcription factor ChIP-seq experiment is to identify, characterize and analyze regions or binding sites of the genome in which transcription factor binds to; this can also identify any DNA-binding proteins as well. Also, transcription factor ChIP-seq experiment can be utilized to identify the primary sequence of the DNA in which the transcription factor recognizes/binds to in the genome.
#The statements below are correct/true:a,b,c and e
#a. the primary problem with multiply mapped reads is that you don't know where in the genome the reads actually map
#b. discarding multiply mapped reads could introduce false negatives (failure to detect true peaks)
#c. many multiply mapped reads map to segmental duplication regions
#e.including multiply-mapped reads in moderate to highly mappable regions can improve peak identification