Italian individual strains

#! /bin/bash

POP=${1};

CHRarray=(2L 2R 3R 3L 4 X);
for CHR in ${CHRarray[*]}; do
  for file in ${POP}$CHR.vcf; do
    ctabular=${file%%.vcf}.ctabular.vcf
        biallelic=${ctabular%%.vcf}.bi.vcf
        NsxStr=${biallelic%%.vcf}.NsxStr.txt
        NsxPos=${biallelic%%.vcf}.NsxPos.txt
        #Nfilt=${biallelic%%.vcf}.Nfilt.vcf
        #NfiltNsxPos=${Nfilt%%.vcf}.NsxPos.txt
        echo "Removing header and renaming columns...";
        tail -n+11 $file | sed 's/#//;s/_realigned_reads//g' > $ctabular
        echo "lines passing filer:"
        wc -l $ctabular
        echo "filtering biallelic SNPs..."
        egrep -E -v '[0-9]\/[2-9]|[2-9]\/[0-9]' $ctabular > $biallelic
        echo "lines passing filer:"
        wc -l $biallelic
        echo "Counting residual heterozigous per strain..."
        awk '{if(NR == 1){split($0,header); for(i=10; i<=NF; i++){strain[header[i]]=0;};} for(i=10; i<=NF; i++){if ($i ~ /0\/1|1\/0/){strain[header[i]]++}}} END {for(i=10; i<=NF; i++){print header[i],strain[header[i]];}}' $biallelic > $NsxStr;
        awk '{ Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}print $2,Ns;}' $biallelic > $NsxPos ;
        #echo "Filtering by 10Ns or less..."
        #awk '{ if (NR >1){Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}} if (Ns<=10){print $0;}}' $biallelic > $Nfilt ;
        #echo "lines passing filer:"
        #wc -l $Nfilt;
        #awk '{ Ns=0; for(i=10; i<=NF; i++){ if ($i ~ /0\/1|1\/0/){Ns++}}print $2,Ns;}' $Nfilt > $NfiltNsxPos
    done;

done;

paste ${POP}2L.ctabular.bi.NsxStr.txt ${POP}2R.ctabular.bi.NsxStr.txt ${POP}3L.ctabular.bi.NsxStr.txt ${POP}3R.ctabular.bi.NsxStr.txt ${POP}4.ctabular.bi.NsxStr.txt ${POP}X.ctabular.bi.NsxStr.txt | awk 'BEGIN{print "strain","2L","2R","3L","3R","4","X"}{print $1,$2,$4,$6,$8,$10,$12}' > NsxStrAllChr.txt

setwd("~/Documents/SNPs/SNP_calling/individual_italy")
library(ggplot2)

#png(filename = "NsxPosHist.png")
par(mfrow=(c(4,2)))
for (CHR in c("2L", "2R", "3R", "3L", "4", "X")){
  NsxPos<-read.table(paste("italian_ind",CHR,".ctabular.bi.NsxPos.txt",sep=""),header=FALSE,col.names=c("pos","Ns"))
  #hist(NsxPos$Ns, main = paste("Histogram" , CHR))  #quantile(NsxPos$Ns, probs=seq(0,1,0.01), type=3)
  print(ggplot(NsxPos, aes(x=Ns))+geom_histogram(binwidth=1, colour="black", fill="white"))
  NsxPosCDF<-ecdf(NsxPos$Ns)
  print (CHR)
  print(NsxPosCDF(0:16))
}

plot of chunk unnamed-chunk-1

## [1] "2L"
##  [1] 0.02052 0.34398 0.50175 0.60993 0.69581 0.76900 0.83239 0.88723
##  [9] 0.93120 0.96373 0.98440 0.99510 0.99922 0.99996 1.00000 1.00000
## [17] 1.00000

plot of chunk unnamed-chunk-1

## [1] "2R"
##  [1] 0.02048 0.33966 0.47886 0.58172 0.66627 0.74120 0.80594 0.86466
##  [9] 0.91358 0.95162 0.97773 0.99264 0.99870 0.99993 0.99999 1.00000
## [17] 1.00000

plot of chunk unnamed-chunk-1

## [1] "3R"
##  [1] 0.01795 0.33214 0.48604 0.59295 0.67852 0.75233 0.81743 0.87398
##  [9] 0.92029 0.95666 0.98082 0.99402 0.99899 0.99995 1.00000 1.00000
## [17] 1.00000

plot of chunk unnamed-chunk-1

## [1] "3L"
##  [1] 0.02761 0.33624 0.48564 0.59045 0.67529 0.74903 0.81364 0.87027
##  [9] 0.91849 0.95501 0.97981 0.99356 0.99888 0.99994 1.00000 1.00000
## [17] 1.00000

plot of chunk unnamed-chunk-1

## [1] "4"
##  [1] 0.02681 0.52219 0.65488 0.74116 0.81505 0.85718 0.88961 0.91394
##  [9] 0.93895 0.95900 0.97567 0.98986 0.99527 0.99617 0.99685 1.00000
## [17] 1.00000

plot of chunk unnamed-chunk-1

## [1] "X"
##  [1] 0.09179 0.40775 0.56436 0.68707 0.79096 0.87393 0.93310 0.97064
##  [9] 0.98964 0.99690 0.99924 0.99977 0.99991 0.99994 0.99997 1.00000
## [17] 1.00000

#dev.off()

#png(filename = "NsxStr.png")
#par(mfrow=(c(4,1)))
for (CHR in c("2L", "2R", "3R", "3L", "4", "X")){
  #print (CHR)
  NsxStr<-read.table(paste("italian_ind",CHR,".ctabular.bi.NsxStr.txt",sep=""),header=FALSE,col.names=c("Strain","Ns"))
  barplot(NsxStr$Ns, main = paste("N's per strain in" , CHR), names.arg=NsxStr$Strain, las=3)
}

plot of chunk unnamed-chunk-2

#dev.off()