DGRP 16 lines as a control for Italian individual strains

#! /bin/bash

POP=${1};

CHRarray=(2L 2R 3R 3L 4 X);
for CHR in ${CHRarray[*]}; do
  for file in ${POP}$CHR.vcf; do
    ctabular=${file%%.vcf}.ctabular.vcf
        biallelic=${ctabular%%.vcf}.bi.vcf
        NsxStr=${biallelic%%.vcf}.NsxStr.txt
        NsxPos=${biallelic%%.vcf}.NsxPos.txt
        #Nfilt=${biallelic%%.vcf}.Nfilt.vcf
        #NfiltNsxPos=${Nfilt%%.vcf}.NsxPos.txt
        echo "Removing header and renaming columns...";
        tail -n+11 $file | sed 's/#//;s/_realigned_reads//g' > $ctabular
        echo "lines passing filer:"
        wc -l $ctabular
        echo "filtering biallelic SNPs..."
        egrep -E -v '[0-9]\/[2-9]|[2-9]\/[0-9]' $ctabular > $biallelic
        echo "lines passing filer:"
        wc -l $biallelic
        echo "Counting residual heterozigous per strain..."
        awk '{if(NR == 1){split($0,header); for(i=10; i<=NF; i++){strain[header[i]]=0;};} for(i=10; i<=NF; i++){if ($i ~ /0\/1|1\/0/){strain[header[i]]++}}} END {for(i=10; i<=NF; i++){print header[i],strain[header[i]];}}' $biallelic > $NsxStr;
        awk '{ Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}print $2,Ns;}' $biallelic > $NsxPos ;
        #echo "Filtering by 10Ns or less..."
        #awk '{ if (NR >1){Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}} if (Ns<=10){print $0;}}' $biallelic > $Nfilt ;
        #echo "lines passing filer:"
        #wc -l $Nfilt;
        #awk '{ Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}print $2,Ns;}' $Nfilt > $NfiltNsxPos
    done;

done;

paste ${POP}2L.ctabular.bi.NsxStr.txt ${POP}2R.ctabular.bi.NsxStr.txt ${POP}3L.ctabular.bi.NsxStr.txt ${POP}3R.ctabular.bi.NsxStr.txt ${POP}4.ctabular.bi.NsxStr.txt ${POP}X.ctabular.bi.NsxStr.txt | awk 'BEGIN{print "strain","2L","2R","3L","3R","4","X"}{print $1,$2,$4,$6,$8,$10,$12}' > NsxStrAllChr.txt

setwd("~/Documents/SNPs/SNP_calling/DGRP16")
library(ggplot2)

#png(filename = "NsxPosHist.png")
par(mfrow=(c(4,2)))
for (CHR in c("2L", "2R", "3R", "3L", "4", "X")){
  NsxPos<-read.table(paste("16lines",CHR,".ctabular.bi.NsxPos.txt",sep=""),header=FALSE,col.names=c("pos","Ns"))
  #hist(NsxPos$Ns, main = paste("Histogram" , CHR))  #quantile(NsxPos$Ns, probs=seq(0,1,0.01), type=3)
  print(ggplot(NsxPos, aes(x=Ns))+geom_histogram(binwidth=1, colour="black", fill="white"))
  NsxPosCDF<-ecdf(NsxPos$Ns)
  print (CHR)
  print(NsxPosCDF(0:16))
}

plot of chunk unnamed-chunk-1

## [1] "2L"
##  [1] 0.9291 0.9951 0.9981 0.9991 0.9995 0.9998 0.9999 1.0000 1.0000 1.0000
## [11] 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000

plot of chunk unnamed-chunk-1

## [1] "2R"
##  [1] 0.9111 0.9935 0.9972 0.9985 0.9991 0.9995 0.9997 0.9999 0.9999 0.9999
## [11] 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000

plot of chunk unnamed-chunk-1

## [1] "3R"
##  [1] 0.6815 0.9568 0.9987 0.9994 0.9997 0.9998 0.9999 1.0000 1.0000 1.0000
## [11] 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000

plot of chunk unnamed-chunk-1

## [1] "3L"
##  [1] 0.9361 0.9939 0.9984 0.9993 0.9996 0.9997 0.9999 0.9999 0.9999 1.0000
## [11] 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000

plot of chunk unnamed-chunk-1

## [1] "4"
##  [1] 0.7732 0.8886 0.9316 0.9490 0.9602 0.9666 0.9714 0.9750 0.9778 0.9828
## [11] 0.9897 0.9981 1.0000 1.0000 1.0000 1.0000 1.0000

plot of chunk unnamed-chunk-1

## [1] "X"
##  [1] 0.9757 0.9920 0.9963 0.9978 0.9985 0.9990 0.9994 0.9996 0.9997 0.9998
## [11] 0.9999 0.9999 1.0000 1.0000 1.0000 1.0000 1.0000

#dev.off()

#png(filename = "NsxStr.png")
#par(mfrow=(c(4,1)))
for (CHR in c("2L", "2R", "3R", "3L", "4", "X")){
  #print (CHR)
  NsxStr<-read.table(paste("16lines",CHR,".ctabular.bi.NsxStr.txt",sep=""),header=FALSE,col.names=c("Strain","Ns"))
  barplot(NsxStr$Ns, main = paste("N's per strain in" , CHR), names.arg=NsxStr$Strain, las=3)
}

plot of chunk unnamed-chunk-2

#dev.off()