datadir=/hpc/hers_en/ywang/featureCount/motor_sample_batch1/
rootdir=/hpc/hers_en/wdenooijer/rnadyn
cd $rootdir/data/
wget https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh
sh Miniconda2-latest-Linux-x86_64.sh
conda create -n cries
conda activate cries
conda install-c bioconda htseq
conda create -n rembrandts
conda activate rembrandts
conda install -c bioconda bioconductor-deseq
conda install -c r r-gplots
module load R
# Getting sample nrs
ls $datadir | grep -o "HRA-0...." | sort -u > sample_nrs.txt
# vi sample_nrs.txt
# add -2 behind 00084 and 00089
# Optional: Get metadata for these samples
# Header of metadata
cat /hpc/hers_en/ywang/NYGC-MetaData/nygc_rna_pheno.txt | head -1 > samples_metadata.txt
# Samples metadata
while read p; do
echo "$p"
cat /hpc/hers_en/ywang/NYGC-MetaData/nygc_rna_pheno.txt | grep "$p" >> samples_metadata.txt
done < sample_nrs.txt
# Select column of choice
cut -f 7 -d "|" samples_metadata.txt
To prepare files that can be used by REMBRANDTS.
Downloading the GTF file
wget ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.chr.gtf.gz
Creating annotation files (code from CRIES)
# This is the source annotation file
# Note that the annotation file should be in Ensembl format (downloaded from Ensembl FTP)
# Specifically, the chromosome names should lack the "chr" prefix
# Also, the first tag in column 9 must be gene_id, and the third tag must be transcript_id
# Also, the source annotation file must have a transcript_source tage in column 9
input="Homo_sapiens.GRCh38.99.chr.gtf"
# 1. Identify transcripts that are supported by both Ensembl and Havana annotations
# 2. Identify and write constitutive exons, i.e. those that appear in all Ensembl/Havana isoforms of a gene
output="./Homo_sapiens.GRCh38.99.chr.consExons.gtf"
cat $input | grep 'transcript_source "ensembl_havana"' | awk -v FS='\t' '$3=="exon" { exonName=$1":"$4":"$5":"$7; split($9, fields, ";"); geneName=fields[1]; transcriptName=fields[3]; printf("%s\t%s\t%s\n",exonName,geneName,transcriptName); }' | sort | uniq | awk -v FS='\t' '{ eCount[$1]++; tCount[$3]++; exonHost[$1]=$2; if(tCount[$3]==1) gCount[$2]++; } END { for(i in eCount) if(eCount[i]==gCount[exonHost[i]]) { split(i,fields,":"); printf("%s\tensembl_havana\texon\t%s\t%s\t.\t%s\t.\t%s;\n",fields[1],fields[2],fields[3],fields[4],exonHost[i]); } }' > $output
# 1. Identify transcripts that are supported by both Ensembl and Havana annotations
# 2. Find all exons
# 3. Write the coordinate of intronic regions, i.e. the regions that separate two adjacent exons of the same gene
output="./Homo_sapiens.GRCh38.99.chr.Introns.gtf"
cat $input | grep 'transcript_source "ensembl_havana"' | awk -v FS='\t' '$3=="exon" { exonName=$1":"$4":"$5":"$7; split($9, fields, ";"); geneName=fields[1]; transcriptName=fields[3]; printf("%s\t%s\t%s\n",exonName,geneName,transcriptName); }' | sort | uniq | awk -v FS='\t' '{ eCount[$1]++; tCount[$3]++; exonHost[$1]=$2; if(tCount[$3]==1) gCount[$2]++; } END { for(i in eCount) { split(i,fields,":"); printf("%s\tensembl_havana\texon\t%s\t%s\t.\t%s\t.\t%s;\n",fields[1],fields[2],fields[3],fields[4],exonHost[i]); } }' | bedtools sort -i stdin | awk -v FS='\t' '{ if( last_exon[$9]==1 && (last_exon_end[$9]+1)<($4-1) ) printf("%s\t%s\tintron\t%i\t%i\t%s\t%s\t%s\t%s\n",$1,$2,last_exon_end[$9]+1,$4-1,$6,$7,$8,$9); last_exon[$9]=1; last_exon_end[$9]=$5; }' > $output
hisat2 -t -p 16 -x <indexPath> -U <fastqFiles> | samtools sort -T <scratchPath> -o <sampleName>.srt.bam -
samtools view -b -F 1548 -q 30 -o <sampleName>.srt.filtered.bam <sampleName>.srt.bam
rm <sampleName>.srt.bam
# Exons
vi $rootdir/scripts/count_exon_reads.sh
# Paste in count_exon_reads.sh:
#!/bin/sh
datadir=/hpc/hers_en/ywang/featureCount/motor_sample_batch1/
rootdir=/hpc/hers_en/wdenooijer/
while read p; do
for i in $p; do
echo $p
htseq-count -m intersection-strict -f bam -t exon -s reverse $datadir/CGND-"$p".Aligned.sortedByCoord.out.bam Homo_sapiens.GRCh38.99.chr.consExons.gtf > $rootdir/data/countsdata/rawout/CGND-"$p".consExons.counts.txt
done
done < sample_nrs.txt
# Save
sbatch -t 48:00:00 $rootdir/scripts/count_exon_reads.sh
# Introns
vi $rootdir/scripts/count_intron_reads.sh
# Paste in count_intron_reads.sh
#!/bin/sh
datadir=/hpc/hers_en/ywang/featureCount/motor_sample_batch1/
rootdir=/hpc/hers_en/wdenooijer/
while read p; do
for i in $p; do
echo $p
htseq-count -m union -f bam -t intron -s reverse $datadir/CGND-"$p".Aligned.sortedByCoord.out.bam Homo_sapiens.GRCh38.99.chr.Introns.gtf > $rootdir/data/countsdata/rawout/CGND-"$p".Introns.counts.txt
done
done < sample_nrs.txt
# Save
sbatch -t 48:00:00 $rootdir/scripts/count_intron_reads.sh
cd countsdata/
for file in *.txt; do cat $file | column -t > $file.tab; done
mkdir bak/
mv *.txt bak/
Must have 4 columns: Label, File, ReadType, Batch
vi make_table.sh
ls *tab | grep -o "HRA-0...." > t1.txt
ls *tab > t2.txt
paste -d "\t" t1.txt t2.txt > metadata.tab
rm t*.txt
# save
sh make_table.sh
rsync -hav ft_gw2hpct01:/hpc/hers_en/wdenooijer/rnadyn/data/countsdata/metadata.txt ./
On local: Populate tab file with exonic/intronic alternating in column 3, 1s in column 4. Comma delimited. Column names.
rsync -hav metadata.csv ft_gw2hpct01:/hpc/hers_en/wdenooijer/rnadyn/data/countsdata/
cat metadata.csv | sed 's/;/\t/g' > metadata.tab
mv metadata.tab ../
Stringency 0.99 -> 2019 genes in analysis biasMode Linear: Currently, only available.
cd /hpc/hers_en/wdenooijer/rnadyn/scripts/REMBRANDTS/
conda activate rembrandts
module load R
sh ./REMBRANDTS.sh test /hpc/hers_en/wdenooijer/rnadyn/data/metadata.tab /hpc/hers_en/wdenooijer/rnadyn/data/countsdata 0.99 linear
rsync -hav ft_gw2hpct01:/hpc/hers_en/wdenooijer/rnadyn/scripts/REMBRANDTS/out/test/* ./
The estimated log2 of abundance of exonic fragments (Δexon), relative to the average of all samples. Assumption: changes in Δexon correspond to changes in steady-state mRNA levels.
exons <- read.delim(file="/Users/wede/Documents/University/Major_research_project/rnadyn_test/exonic.filtered.mx.txt")
exons_hm <- as.matrix(exons[,-1])
library(RColorBrewer)
coul <- colorRampPalette(rev(brewer.pal(8, "RdYlBu")))(26)
library(pheatmap)
pheatmap(exons_hm, scale="row",col=coul)
The heatmap of Pearson similarities of samples with respect to the above estimates.
The estimated log2 of abundance of intronic fragments (Δintron), relative to the average of all samples. Assumption: Changes in Δintron correspond to changes in transcription rate.
introns <- read.delim(file="/Users/wede/Documents/University/Major_research_project/rnadyn_test/intronic.filtered.mx.txt")
introns_hm <- as.matrix(introns[,-1])
#coul <- colorRampPalette(rev(brewer.pal(8, "RdYlBu")))(26)
pheatmap(introns_hm, scale="row",col=coul)
The heatmap of Pearson similarities of samples with respect to the above estimates.
The unbiased estimates of differential mRNA stability (Δexon–Δintron–bias), relative to the average of all samples.
Δexon- Δintron-bias=t_(1/2)
stability <- read.delim(file="/Users/wede/Documents/University/Major_research_project/rnadyn_test/stability.filtered.mx.txt")
stability_hm <- as.matrix(stability[,-1])
#coul <- colorRampPalette(rev(brewer.pal(8, "RdYlBu")))(26)
pheatmap(stability_hm, scale="row",col=coul)
The heatmap of Pearson similarities of samples with respect to the above estimates.
The scatterplot of Δexon vs. Δintron for all filtered genes in all samples.