#! /bin/bash
POP=${1};
CHRarray=(2L 2R 3R 3L 4 X);
for CHR in ${CHRarray[*]}; do
for file in ${POP}$CHR.vcf; do
ctabular=${file%%.vcf}.ctabular.vcf
biallelic=${ctabular%%.vcf}.bi.vcf
NsxStr=${biallelic%%.vcf}.NsxStr.txt
NsxPos=${biallelic%%.vcf}.NsxPos.txt
#Nfilt=${biallelic%%.vcf}.Nfilt.vcf
#NfiltNsxPos=${Nfilt%%.vcf}.NsxPos.txt
echo "Removing header and renaming columns...";
tail -n+11 $file | sed 's/#//;s/_realigned_reads//g' > $ctabular
echo "lines passing filer:"
wc -l $ctabular
echo "filtering biallelic SNPs..."
egrep -E -v '[0-9]\/[2-9]|[2-9]\/[0-9]' $ctabular > $biallelic
echo "lines passing filer:"
wc -l $biallelic
echo "Counting residual heterozigous per strain..."
awk '{if(NR == 1){split($0,header); for(i=10; i<=NF; i++){strain[header[i]]=0;};} for(i=10; i<=NF; i++){if ($i ~ /0\/1|1\/0/){strain[header[i]]++}}} END {for(i=10; i<=NF; i++){print header[i],strain[header[i]];}}' $biallelic > $NsxStr;
awk '{ Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}print $2,Ns;}' $biallelic > $NsxPos ;
#echo "Filtering by 10Ns or less..."
#awk '{ if (NR >1){Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}} if (Ns<=10){print $0;}}' $biallelic > $Nfilt ;
#echo "lines passing filer:"
#wc -l $Nfilt;
#awk '{ Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}print $2,Ns;}' $Nfilt > $NfiltNsxPos
done;
done;
paste ${POP}2L.ctabular.bi.NsxStr.txt ${POP}2R.ctabular.bi.NsxStr.txt ${POP}3L.ctabular.bi.NsxStr.txt ${POP}3R.ctabular.bi.NsxStr.txt ${POP}4.ctabular.bi.NsxStr.txt ${POP}X.ctabular.bi.NsxStr.txt | awk 'BEGIN{print "strain","2L","2R","3L","3R","4","X"}{print $1,$2,$4,$6,$8,$10,$12}' > NsxStrAllChr.txt
setwd("~/Documents/SNPs/SNP_calling/individual_sweden")
library(ggplot2)
#png(filename = "NsxPosHist.png")
par(mfrow=(c(4,2)))
for (CHR in c("2L", "2R", "3R", "3L", "4", "X")){
NsxPos<-read.table(paste("sweden_ind",CHR,".ctabular.bi.NsxPos.txt",sep=""),header=FALSE,col.names=c("pos","Ns"))
#hist(NsxPos$Ns, main = paste("Histogram" , CHR)) #quantile(NsxPos$Ns, probs=seq(0,1,0.01), type=3)
print(ggplot(NsxPos, aes(x=Ns))+geom_histogram(binwidth=1, colour="black", fill="white"))
NsxPosCDF<-ecdf(NsxPos$Ns)
print (CHR)
print(NsxPosCDF(0:27))
}
## [1] "2L"
## [1] 0.01233 0.21638 0.33900 0.42974 0.50415 0.56687 0.61918 0.66611
## [9] 0.70823 0.74646 0.78222 0.81627 0.84940 0.88097 0.91040 0.93613
## [17] 0.95778 0.97489 0.98679 0.99422 0.99792 0.99940 0.99990 0.99999
## [25] 1.00000 1.00000 1.00000 1.00000
## [1] "2R"
## [1] 0.01306 0.24918 0.37372 0.45654 0.52128 0.57902 0.62965 0.67571
## [9] 0.71917 0.76005 0.79834 0.83468 0.86935 0.90153 0.92982 0.95349
## [17] 0.97168 0.98444 0.99267 0.99698 0.99905 0.99975 0.99992 0.99999
## [25] 0.99999 0.99999 1.00000 1.00000
## [1] "3R"
## [1] 0.01171 0.24180 0.36717 0.45468 0.52168 0.57620 0.62387 0.66797
## [9] 0.70880 0.74698 0.78331 0.81864 0.85278 0.88520 0.91549 0.94145
## [17] 0.96294 0.97907 0.99001 0.99607 0.99885 0.99977 0.99996 1.00000
## [25] 1.00000 1.00000 1.00000 1.00000
## [1] "3L"
## [1] 0.01386 0.24376 0.37356 0.46391 0.53224 0.58901 0.63742 0.68087
## [9] 0.72204 0.76060 0.79815 0.83449 0.86781 0.89897 0.92712 0.95054
## [17] 0.96954 0.98312 0.99173 0.99658 0.99896 0.99974 0.99995 0.99998
## [25] 0.99999 0.99999 1.00000 1.00000
## [1] "4"
## [1] 0.03366 0.35451 0.51797 0.60819 0.69826 0.76467 0.80591 0.83669
## [9] 0.86429 0.89280 0.91933 0.94071 0.96042 0.97165 0.97756 0.98135
## [17] 0.98362 0.98453 0.98514 0.98575 0.98620 0.98635 0.98681 0.98772
## [25] 0.98878 0.99090 1.00000 1.00000
## [1] "X"
## [1] 0.04799 0.29411 0.42381 0.50512 0.57351 0.63285 0.68717 0.73825
## [9] 0.78737 0.83320 0.87598 0.91337 0.94390 0.96638 0.98178 0.99111
## [17] 0.99602 0.99838 0.99937 0.99972 0.99983 0.99986 0.99988 0.99989
## [25] 0.99991 0.99993 1.00000 1.00000
#dev.off()
#png(filename = "NsxStr.png")
#par(mfrow=(c(4,1)))
for (CHR in c("2L", "2R", "3R", "3L", "4", "X")){
#print (CHR)
NsxStr<-read.table(paste("sweden_ind",CHR,".ctabular.bi.NsxStr.txt",sep=""),header=FALSE,col.names=c("Strain","Ns"))
barplot(NsxStr$Ns, main = paste("N's per strain in" , CHR), names.arg=NsxStr$Strain, las=3)
}
#dev.off()