### Download rclone
wget https://downloads.rclone.org/v1.54.1/rclone-v1.54.1-osx-amd64.zip
unzip rclone-v1.54.1-osx-amd64.zip
sed 's/.*/&.Aligned.sortedByCoord.out.cram/' sample_nrs.txt > samples.txt
$rcloneDir="/hpc/hers_en/wdenooijer/pkgs/rclone"
# Check available files
$rcloneDir/rclone lsf --config="$rcloneDir/nygc_sample.conf" nygc_sample:
# Download all files
vi dl.sh
#!/bin/sh
while read p; do
echo $p
/hpc/hers_en/wdenooijer/pkgs/rclone/rclone copy --config="/hpc/hers_en/wdenooijer/pkgs/rclone/nygc_sample.conf" nygc_sample:"$p" ./counutsdata/cram
done < samples.txt
# save
sbatch -t 24:00:00 dl.sh
module load samtools
cd /hpc/hers_en/wdenooijer/rnadyn/data/countsdata/bam
vi cramsam.sh
#!/bin/sh
while read p; do
echo $p
samtools view -b -h -T /hpc/hers_en/ywang/Reference/Homo_sapiens.GRCh38.dna.primary_assembly.fa /hpc/hers_en/wdenooijer/rnadyn/data/countsdata/cram/"$p" > "$p".bam
done < sample_nrs.txt
# Save
sh cramsam.sh
datadir=/hpc/hers_en/wdenooijer/rnadyn/data/
rootdir=/hpc/hers_en/wdenooijer/rnadyn/
cd $datadir
conda create -n cries
conda activate cries
conda install-c bioconda htseq
conda create -n rembrandts
conda activate rembrandts
conda install -c bioconda bioconductor-deseq
conda install -c r r-gplots
module load R
# Header of metadata
cat /hpc/hers_en/ywang/NYGC-MetaData/nygc_rna_pheno.txt | head -1 > samples_metadata.txt
# Samples metadata
while read p; do
echo "$p"
cat /hpc/hers_en/ywang/NYGC-MetaData/nygc_rna_pheno.txt | grep "$p" >> samples_metadata.txt
done < sample_nrs.txt
# Select column of choice
cut -f 7 -d "|" samples_metadata.txt
To prepare files that can be used by REMBRANDTS.
Downloading the GTF file
wget ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.chr.gtf.gz
Creating annotation files (code from CRIES)
# This is the source annotation file
# Note that the annotation file should be in Ensembl format (downloaded from Ensembl FTP)
# Specifically, the chromosome names should lack the "chr" prefix
# Also, the first tag in column 9 must be gene_id, and the third tag must be transcript_id
# Also, the source annotation file must have a transcript_source tage in column 9
input="Homo_sapiens.GRCh38.99.chr.gtf"
# 1. Identify transcripts that are supported by both Ensembl and Havana annotations
# 2. Identify and write constitutive exons, i.e. those that appear in all Ensembl/Havana isoforms of a gene
output="./Homo_sapiens.GRCh38.99.chr.consExons.gtf"
cat $input | grep 'transcript_source "ensembl_havana"' | awk -v FS='\t' '$3=="exon" { exonName=$1":"$4":"$5":"$7; split($9, fields, ";"); geneName=fields[1]; transcriptName=fields[3]; printf("%s\t%s\t%s\n",exonName,geneName,transcriptName); }' | sort | uniq | awk -v FS='\t' '{ eCount[$1]++; tCount[$3]++; exonHost[$1]=$2; if(tCount[$3]==1) gCount[$2]++; } END { for(i in eCount) if(eCount[i]==gCount[exonHost[i]]) { split(i,fields,":"); printf("%s\tensembl_havana\texon\t%s\t%s\t.\t%s\t.\t%s;\n",fields[1],fields[2],fields[3],fields[4],exonHost[i]); } }' > $output
# 1. Identify transcripts that are supported by both Ensembl and Havana annotations
# 2. Find all exons
# 3. Write the coordinate of intronic regions, i.e. the regions that separate two adjacent exons of the same gene
output="./Homo_sapiens.GRCh38.99.chr.Introns.gtf"
cat $input | grep 'transcript_source "ensembl_havana"' | awk -v FS='\t' '$3=="exon" { exonName=$1":"$4":"$5":"$7; split($9, fields, ";"); geneName=fields[1]; transcriptName=fields[3]; printf("%s\t%s\t%s\n",exonName,geneName,transcriptName); }' | sort | uniq | awk -v FS='\t' '{ eCount[$1]++; tCount[$3]++; exonHost[$1]=$2; if(tCount[$3]==1) gCount[$2]++; } END { for(i in eCount) { split(i,fields,":"); printf("%s\tensembl_havana\texon\t%s\t%s\t.\t%s\t.\t%s;\n",fields[1],fields[2],fields[3],fields[4],exonHost[i]); } }' | bedtools sort -i stdin | awk -v FS='\t' '{ if( last_exon[$9]==1 && (last_exon_end[$9]+1)<($4-1) ) printf("%s\t%s\tintron\t%i\t%i\t%s\t%s\t%s\t%s\n",$1,$2,last_exon_end[$9]+1,$4-1,$6,$7,$8,$9); last_exon[$9]=1; last_exon_end[$9]=$5; }' > $output
hisat2 -t -p 16 -x <indexPath> -U <fastqFiles> | samtools sort -T <scratchPath> -o <sampleName>.srt.bam -
samtools view -b -F 1548 -q 30 -o <sampleName>.srt.filtered.bam <sampleName>.srt.bam
rm <sampleName>.srt.bam
Filter for MAPQ
cp samples.txt /hpc/hers_en/wdenooijer/rnadyn/scripts/rnadyn_counts/
sed -i 's/.cram/.cram.bam/' sample_nrs.txt
cd /hpc/hers_en/wdenooijer/rnadyn/scripts/rnadyn_counts
#!/bin/bash
#SBATCH --job-name=count_exons
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=w.denooijer@students.uu.nl
#SBATCH --ntasks=1
#SBATCH --mem=10gb
#SBATCH --cpus-per-task=1
#SBATCH --time=24:00:00
#SBATCH --output=xx.log
#SBATCH --array=1-24
datadir=/hpc/hers_en/wdenooijer/rnadyn/data/countsdata/bam/
rootdir=/hpc/hers_en/wdenooijer/
FILES=( $(cat sample_nrs.txt) ) ### Create a bash array of all the filenames you need
PER_TASK=10
START_NUM=$(( ($SLURM_ARRAY_TASK_ID - 1) * $PER_TASK + 1))
END_NUM=$(( $SLURM_ARRAY_TASK_ID * $PER_TASK ))
for (( run=$START_NUM; run<=END_NUM; run++ )); do
idx=$(($run-1))
filename=${FILES[$idx]}
echo Input file is $filename
htseq-count -m intersection-strict -f bam -t exon -s reverse ${datadir}/${filename} Homo_sapiens.GRCh38.99.chr.consExons.gtf > ${rootdir}/rnadyn/data/countsdata/${filename}.Exons.txt
done
cd countsdata/
for file in *.txt; do cat $file | column -t > $file.tab; done
mkdir counts_bak/
mv *.txt counts_bak/
Must have 4 columns: Label, File, ReadType, Batch
vi make_table.sh
ls *tab | grep -o "HRA-0...." > t1.txt
ls *tab > t2.txt
paste -d "\t" t1.txt t2.txt > metadata.tab
rm t*.txt
# save
sh make_table.sh
rsync -hav ft_gw2hpct01:/hpc/hers_en/wdenooijer/rnadyn/data/countsdata/metadata.txt ./
On local: Populate tab file with exonic/intronic alternating in column 3, 1s in column 4. Comma delimited. Column names.
rsync -hav metadata.csv ft_gw2hpct01:/hpc/hers_en/wdenooijer/rnadyn/data/countsdata/
cat metadata.csv | sed 's/;/\t/g' > metadata.tab
mv metadata.tab ../
“Optimizing read count cutoff at stringency 0.99 …”
“Total correlation is 0.628594966124487”
“Total number of genes is 16008” “Maximum correlation is 0.736020795671085”
“Selected threshold is 10.2163901737”
“Number of remaining genes is 2187”
biasMode Linear: Currently, only available.
cd /hpc/hers_en/wdenooijer/rnadyn/scripts/REMBRANDTS/
conda activate rembrandts
module load R
sh ./REMBRANDTS.sh motor_cortex /hpc/hers_en/wdenooijer/rnadyn/data/countsdata/metadata.tab /hpc/hers_en/wdenooijer/rnadyn/data/countsdata 0.99 linear
rsync -hav ft_gw2hpct01:/hpc/hers_en/wdenooijer/rnadyn/scripts/REMBRANDTS/out/motor_cortex/* ./
cd /hpc/hers_en/wdenooijer/rnadyn/data/countsdata
vi readcounts.sh
#
cut -f 1 -d ' ' CGND-HRA-00055.Aligned.sortedByCoord.out.cram.bam.Exons.txt.tab > exoncounts.txt
for file in *Exons*;
do
awk '{print $2}' $file > x_$file.txt
done
paste x* > exoncounts.txt
rm x*
cut -f 1 -d ' ' CGND-HRA-00055.Aligned.sortedByCoord.out.cram.bam.Introns.txt.tab > introncounts.txt
for file in *Introns*;
do
awk '{print $2}' $file > x_$file.txt
done
paste x* > introncounts.txt
rm x*
#
sh readcounts.sh
rsync