Overview of pipeline

### Download rclone

wget https://downloads.rclone.org/v1.54.1/rclone-v1.54.1-osx-amd64.zip
unzip rclone-v1.54.1-osx-amd64.zip

From sample nrs to filename

sed 's/.*/&.Aligned.sortedByCoord.out.cram/' sample_nrs.txt > samples.txt

Downloading the data

$rcloneDir="/hpc/hers_en/wdenooijer/pkgs/rclone"
# Check available files
$rcloneDir/rclone lsf --config="$rcloneDir/nygc_sample.conf" nygc_sample:
# Download all files
vi dl.sh
#!/bin/sh
while read p; do
echo $p
/hpc/hers_en/wdenooijer/pkgs/rclone/rclone copy --config="/hpc/hers_en/wdenooijer/pkgs/rclone/nygc_sample.conf" nygc_sample:"$p" ./counutsdata/cram
done < samples.txt
# save
sbatch -t 24:00:00 dl.sh

Converting CRAM to BAM

module load samtools

cd /hpc/hers_en/wdenooijer/rnadyn/data/countsdata/bam
vi cramsam.sh
#!/bin/sh
while read p; do
echo $p
samtools view -b -h -T /hpc/hers_en/ywang/Reference/Homo_sapiens.GRCh38.dna.primary_assembly.fa /hpc/hers_en/wdenooijer/rnadyn/data/countsdata/cram/"$p" > "$p".bam
done < sample_nrs.txt
# Save
sh cramsam.sh

Define directories

datadir=/hpc/hers_en/wdenooijer/rnadyn/data/
rootdir=/hpc/hers_en/wdenooijer/rnadyn/
cd $datadir

CRIES environment

conda create -n cries
conda activate cries
conda install-c bioconda htseq

Rembrandts environment

conda create -n rembrandts
conda activate rembrandts
conda install -c bioconda bioconductor-deseq
conda install -c r r-gplots
module load R

Getting metadata for samples

# Header of metadata
cat /hpc/hers_en/ywang/NYGC-MetaData/nygc_rna_pheno.txt | head -1 > samples_metadata.txt
# Samples metadata
while read p; do
        echo "$p"
        cat /hpc/hers_en/ywang/NYGC-MetaData/nygc_rna_pheno.txt | grep "$p" >> samples_metadata.txt
done < sample_nrs.txt

# Select column of choice
cut -f 7 -d "|" samples_metadata.txt 

CRIES: Counting Reads for Intronic and Exonic Segments

To prepare files that can be used by REMBRANDTS.

CRIES Step 1: Creating GTF annotation files

Downloading the GTF file

wget ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.chr.gtf.gz 

Creating annotation files (code from CRIES)

# This is the source annotation file
# Note that the annotation file should be in Ensembl format (downloaded from Ensembl FTP)
# Specifically, the chromosome names should lack the "chr" prefix
# Also, the first tag in column 9 must be gene_id, and the third tag must be transcript_id
# Also, the source annotation file must have a transcript_source tage in column 9

input="Homo_sapiens.GRCh38.99.chr.gtf"

# 1. Identify transcripts that are supported by both Ensembl and Havana annotations
# 2. Identify and write constitutive exons, i.e. those that appear in all Ensembl/Havana isoforms of a gene
output="./Homo_sapiens.GRCh38.99.chr.consExons.gtf"
cat $input | grep 'transcript_source "ensembl_havana"' | awk -v FS='\t' '$3=="exon" { exonName=$1":"$4":"$5":"$7; split($9, fields, ";"); geneName=fields[1]; transcriptName=fields[3]; printf("%s\t%s\t%s\n",exonName,geneName,transcriptName); }' | sort | uniq | awk -v FS='\t' '{ eCount[$1]++; tCount[$3]++; exonHost[$1]=$2; if(tCount[$3]==1) gCount[$2]++; } END { for(i in eCount) if(eCount[i]==gCount[exonHost[i]]) { split(i,fields,":"); printf("%s\tensembl_havana\texon\t%s\t%s\t.\t%s\t.\t%s;\n",fields[1],fields[2],fields[3],fields[4],exonHost[i]); } }' > $output

# 1. Identify transcripts that are supported by both Ensembl and Havana annotations
# 2. Find all exons
# 3. Write the coordinate of intronic regions, i.e. the regions that separate two adjacent exons of the same gene
output="./Homo_sapiens.GRCh38.99.chr.Introns.gtf"
cat $input | grep 'transcript_source "ensembl_havana"' | awk -v FS='\t' '$3=="exon" { exonName=$1":"$4":"$5":"$7; split($9, fields, ";"); geneName=fields[1]; transcriptName=fields[3]; printf("%s\t%s\t%s\n",exonName,geneName,transcriptName); }' | sort | uniq | awk -v FS='\t' '{ eCount[$1]++; tCount[$3]++; exonHost[$1]=$2; if(tCount[$3]==1) gCount[$2]++; } END { for(i in eCount) { split(i,fields,":"); printf("%s\tensembl_havana\texon\t%s\t%s\t.\t%s\t.\t%s;\n",fields[1],fields[2],fields[3],fields[4],exonHost[i]); } }' | bedtools sort -i stdin | awk -v FS='\t' '{ if( last_exon[$9]==1 && (last_exon_end[$9]+1)<($4-1) ) printf("%s\t%s\tintron\t%i\t%i\t%s\t%s\t%s\t%s\n",$1,$2,last_exon_end[$9]+1,$4-1,$6,$7,$8,$9); last_exon[$9]=1; last_exon_end[$9]=$5; }' > $output

CRIES Step 2: Mapping Reads - !!Not performed

hisat2 -t -p 16 -x <indexPath> -U <fastqFiles> | samtools sort -T <scratchPath> -o <sampleName>.srt.bam -
samtools view -b -F 1548 -q 30 -o <sampleName>.srt.filtered.bam <sampleName>.srt.bam
rm <sampleName>.srt.bam

CRIES Step 3: Counting reads that map to intronic or exonic segments of each gene

Filter for MAPQ

cp samples.txt /hpc/hers_en/wdenooijer/rnadyn/scripts/rnadyn_counts/
sed -i 's/.cram/.cram.bam/' sample_nrs.txt
cd /hpc/hers_en/wdenooijer/rnadyn/scripts/rnadyn_counts

#!/bin/bash
#SBATCH --job-name=count_exons
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=w.denooijer@students.uu.nl
#SBATCH --ntasks=1
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=1
#SBATCH --time=24:00:00
#SBATCH --output=xx.log
#SBATCH --array=1-24 

datadir=/hpc/hers_en/wdenooijer/rnadyn/data/countsdata/bam/
rootdir=/hpc/hers_en/wdenooijer/

FILES=( $(cat sample_nrs.txt) ) ### Create a bash array of all the filenames you need

PER_TASK=10
START_NUM=$(( ($SLURM_ARRAY_TASK_ID - 1) * $PER_TASK + 1))
END_NUM=$(( $SLURM_ARRAY_TASK_ID * $PER_TASK ))

for (( run=$START_NUM; run<=END_NUM; run++ )); do
    idx=$(($run-1))
    filename=${FILES[$idx]}
    echo Input file is $filename
    htseq-count -m intersection-strict -f bam -t exon -s reverse ${datadir}/${filename} Homo_sapiens.GRCh38.99.chr.consExons.gtf > ${rootdir}/rnadyn/data/countsdata/${filename}.Exons.txt
done

Make output tab delimited

cd countsdata/
for file in *.txt; do cat $file | column -t > $file.tab; done
mkdir counts_bak/
mv *.txt counts_bak/

Create metadata table for REMBRANDTS

Must have 4 columns: Label, File, ReadType, Batch

vi make_table.sh
ls *tab | grep -o "HRA-0...." > t1.txt
ls *tab > t2.txt
paste -d "\t" t1.txt t2.txt > metadata.tab
rm t*.txt
# save
sh make_table.sh
rsync -hav ft_gw2hpct01:/hpc/hers_en/wdenooijer/rnadyn/data/countsdata/metadata.txt ./

On local: Populate tab file with exonic/intronic alternating in column 3, 1s in column 4. Comma delimited. Column names.

rsync -hav metadata.csv ft_gw2hpct01:/hpc/hers_en/wdenooijer/rnadyn/data/countsdata/
cat metadata.csv | sed 's/;/\t/g' > metadata.tab
mv metadata.tab ../

Run REMBRANDTS

“Optimizing read count cutoff at stringency 0.99 …”
“Total correlation is 0.628594966124487”
“Total number of genes is 16008” “Maximum correlation is 0.736020795671085”
“Selected threshold is 10.2163901737”
“Number of remaining genes is 2187”
biasMode Linear: Currently, only available.

cd /hpc/hers_en/wdenooijer/rnadyn/scripts/REMBRANDTS/
conda activate rembrandts
module load R
sh ./REMBRANDTS.sh motor_cortex /hpc/hers_en/wdenooijer/rnadyn/data/countsdata/metadata.tab /hpc/hers_en/wdenooijer/rnadyn/data/countsdata 0.99 linear

Output analysis

rsync -hav ft_gw2hpct01:/hpc/hers_en/wdenooijer/rnadyn/scripts/REMBRANDTS/out/motor_cortex/* ./

Read counts

cd /hpc/hers_en/wdenooijer/rnadyn/data/countsdata
vi readcounts.sh
#
cut -f 1 -d ' ' CGND-HRA-00055.Aligned.sortedByCoord.out.cram.bam.Exons.txt.tab > exoncounts.txt
for file in *Exons*;
do
awk '{print $2}' $file > x_$file.txt
done
paste x* > exoncounts.txt
rm x*

cut -f 1 -d ' ' CGND-HRA-00055.Aligned.sortedByCoord.out.cram.bam.Introns.txt.tab > introncounts.txt
for file in *Introns*;
do
awk '{print $2}' $file > x_$file.txt
done
paste x* > introncounts.txt
rm x*
#
sh readcounts.sh
rsync