#! /bin/bash
POP=${1};
CHRarray=(2L 2R 3R 3L 4 X);
for CHR in ${CHRarray[*]}; do
for file in ${POP}$CHR.vcf; do
ctabular=${file%%.vcf}.ctabular.vcf
biallelic=${ctabular%%.vcf}.bi.vcf
NsxStr=${biallelic%%.vcf}.NsxStr.txt
NsxPos=${biallelic%%.vcf}.NsxPos.txt
#Nfilt=${biallelic%%.vcf}.Nfilt.vcf
#NfiltNsxPos=${Nfilt%%.vcf}.NsxPos.txt
echo "Removing header and renaming columns...";
tail -n+11 $file | sed 's/#//;s/_realigned_reads//g' > $ctabular
echo "lines passing filer:"
wc -l $ctabular
echo "filtering biallelic SNPs..."
egrep -E -v '[0-9]\/[2-9]|[2-9]\/[0-9]' $ctabular > $biallelic
echo "lines passing filer:"
wc -l $biallelic
echo "Counting residual heterozigous per strain..."
awk '{if(NR == 1){split($0,header); for(i=10; i<=NF; i++){strain[header[i]]=0;};} for(i=10; i<=NF; i++){if ($i ~ /0\/1|1\/0/){strain[header[i]]++}}} END {for(i=10; i<=NF; i++){print header[i],strain[header[i]];}}' $biallelic > $NsxStr;
awk '{ Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}print $2,Ns;}' $biallelic > $NsxPos ;
#echo "Filtering by 10Ns or less..."
#awk '{ if (NR >1){Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}} if (Ns<=10){print $0;}}' $biallelic > $Nfilt ;
#echo "lines passing filer:"
#wc -l $Nfilt;
#awk '{ Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}print $2,Ns;}' $Nfilt > $NfiltNsxPos
done;
done;
paste ${POP}2L.ctabular.bi.NsxStr.txt ${POP}2R.ctabular.bi.NsxStr.txt ${POP}3L.ctabular.bi.NsxStr.txt ${POP}3R.ctabular.bi.NsxStr.txt ${POP}4.ctabular.bi.NsxStr.txt ${POP}X.ctabular.bi.NsxStr.txt | awk 'BEGIN{print "strain","2L","2R","3L","3R","4","X"}{print $1,$2,$4,$6,$8,$10,$12}' > NsxStrAllChr.txt
setwd("~/Documents/SNPs/SNP_calling/individual_italy")
library(ggplot2)
#png(filename = "NsxPosHist.png")
par(mfrow=(c(4,2)))
for (CHR in c("2L", "2R", "3R", "3L", "4", "X")){
NsxPos<-read.table(paste("italian_ind",CHR,".ctabular.bi.NsxPos.txt",sep=""),header=FALSE,col.names=c("pos","Ns"))
#hist(NsxPos$Ns, main = paste("Histogram" , CHR)) #quantile(NsxPos$Ns, probs=seq(0,1,0.01), type=3)
print(ggplot(NsxPos, aes(x=Ns))+geom_histogram(binwidth=1, colour="black", fill="white"))
NsxPosCDF<-ecdf(NsxPos$Ns)
print (CHR)
print(NsxPosCDF(0:16))
}
## [1] "2L"
## [1] 0.02052 0.34398 0.50175 0.60993 0.69581 0.76900 0.83239 0.88723
## [9] 0.93120 0.96373 0.98440 0.99510 0.99922 0.99996 1.00000 1.00000
## [17] 1.00000
## [1] "2R"
## [1] 0.02048 0.33966 0.47886 0.58172 0.66627 0.74120 0.80594 0.86466
## [9] 0.91358 0.95162 0.97773 0.99264 0.99870 0.99993 0.99999 1.00000
## [17] 1.00000
## [1] "3R"
## [1] 0.01795 0.33214 0.48604 0.59295 0.67852 0.75233 0.81743 0.87398
## [9] 0.92029 0.95666 0.98082 0.99402 0.99899 0.99995 1.00000 1.00000
## [17] 1.00000
## [1] "3L"
## [1] 0.02761 0.33624 0.48564 0.59045 0.67529 0.74903 0.81364 0.87027
## [9] 0.91849 0.95501 0.97981 0.99356 0.99888 0.99994 1.00000 1.00000
## [17] 1.00000
## [1] "4"
## [1] 0.02681 0.52219 0.65488 0.74116 0.81505 0.85718 0.88961 0.91394
## [9] 0.93895 0.95900 0.97567 0.98986 0.99527 0.99617 0.99685 1.00000
## [17] 1.00000
## [1] "X"
## [1] 0.09179 0.40775 0.56436 0.68707 0.79096 0.87393 0.93310 0.97064
## [9] 0.98964 0.99690 0.99924 0.99977 0.99991 0.99994 0.99997 1.00000
## [17] 1.00000
#dev.off()
#png(filename = "NsxStr.png")
#par(mfrow=(c(4,1)))
for (CHR in c("2L", "2R", "3R", "3L", "4", "X")){
#print (CHR)
NsxStr<-read.table(paste("italian_ind",CHR,".ctabular.bi.NsxStr.txt",sep=""),header=FALSE,col.names=c("Strain","Ns"))
barplot(NsxStr$Ns, main = paste("N's per strain in" , CHR), names.arg=NsxStr$Strain, las=3)
}
#dev.off()