rm(list = ls())
knitr::opts_chunk$set(echo = TRUE)

library(MASS)
library(countreg)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
library(dplyr)
library(glmmTMB)
library(car)
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some

rm all.depths Xid.all.depths

cat ../depths_allSamples/*.depths >> all.depths 

# awk line to append "X" to id number
awk '{print $1, $2, $3, "X"$4}' all.depths >> Xid.all.depths

head Xid.all.depths
rm sort.clean2.master.res

cat ../sort.clean.Beh01_IDs_AG_TC_master.res | awk '{print $1,$2,$3,$4,$5,$6,$7,"X"$8}' | tr ":" "   " | tr ' ' ',' | tr '\t' ',' >> sort.clean2.master.res

head sort.clean2.master.res

# WARNING: requires 39.9hrs run time!
#./supReads_append.sh

awk '{if (NF <= 4) print $0, "NA", 0, 0, 0; else print $0}' yes.final > zeros.yes.final

Read in data

df.all <- read.csv(file = "Xid.all.depths", header = FALSE)
df.master <- read.csv(file = "sort.clean2.master.res", header = F)

# drop unneeded columns
df.master[,c("V1","V3","V4","V5","V7","V8","V9")] -> df.master

# replaces NA with zero  
df.all[is.na(df.all)] <- 0

# column names
colnames(df.all) <- c("Chrom", "Site", "DP", "ID")
colnames(df.master) <- c("Chrom", "Site", "Type", "SupReads", "AD", "DP", "ID")

nrow(df.master)

df.master %>%
  filter(Type == "AG" | Type == "TC" & Site != "147947") %>%
  filter(ID != "XTC") -> df.master.clean
  
nrow(df.master.clean)

#accidentally used Pos1 instead of Pos2, just add 1 to   correct this
df.all$Site = df.all$Site + 1

write.csv(df.master.clean, file = "df.master.csv", quote = FALSE, row.names = FALSE)

write.csv(df.all, file = "df.all.csv", quote = FALSE, row.names = FALSE)

#View(full_join(df.all, df.master, by = c("ChromSite", "ID")))

yes = read.csv(file = "zeros.yes.final", header = F, sep = " ")

colnames(yes) <- c("Chrom","Site","Depth_mw","Fish_id","Edit_type","SR","AD","DP")

nrow(yes)

yes %>%
  filter(Chrom != "AG") %>%
  filter(Chrom != "TC" ) %>% 
  filter(Chrom != "X") -> tmp.yes.forAwk

nrow(tmp.yes.forAwk)

View(tmp.yes.forAwk)

write.csv(tmp.yes.forAwk, file = "tmp.yes.forAwk", quote = F, row.names = FALSE)

head -5 tmp.yes.forAwk

awk -F "," '$5 != "X" {print}' tmp.yes.forAwk > yes.X.remove

wc -l yes.X.remove

head yes.X.remove
Xid = read.csv(file = "Xid.all.depths", header = FALSE)

sortClean2 = read.csv(file = "sort.clean2.master.res", header = FALSE)

#yes.clean = read.csv(file = "yes.X.remove")

#yes.clean %>%
#  nrow()

filter_depth_bottom = 10
filter_depth_top = 50000

#modify Fish_id
rm yes.modID

sed 's/X00/oo/g' yes.X.remove | sed 's/X0/o/g' | tr ',' '  ' >> yes.modID


POParray=(oo2 oo4 oo6 oo8 o13 o15 o18 o20 o22 o24 o25 o27 o29 o31 o34 o36 o41 o43 o45 o47)

pop=0

while read line
do
    for x in $line
    do
        tmp[i]=$x
        ((i++))        
    done
    
    i=0
      
    for y in "${POParray[@]}"
    do
          if [ $y == ${tmp[3]} ]  # if sample ID (Mxx) is equivalent to POParray sampleID 
          then
              echo ${tmp[@]} AH >> popAppend.yes # then store full line (fields 0-8) and append "AH"
              pop=AH
          fi
    done
    
    if [ $pop != 'AH' ]
    then
        echo ${tmp[@]} NJ >> popAppend.yes
    fi
    
    pop=0
    
done < yes.modID
  
Script for env append: envAppend.sh

PopAppend was modified for environment.


rm env.pop.append.yes

ENVarray=(oo3 oo4 oo5 oo6 o13 o14 o20 o21 o22 o27 o28 o29 o30 o35 o36 o43 o44 o45 o46 o53)

env=0

while read line
do
    for x in $line
    do
        tmp[i]=$x
        ((i++))        
    done
    
    i=0
      
    for y in "${ENVarray[@]}"
    do
          if [ "$y" == "${tmp[3]}" ]  # if sample ID (Mxx) is equivalent to POParray sampleID 
          then
              echo ${tmp[@]} P >> env.pop.append.yes # then store full line (fields 0-8) and append "AH"
              env=p
          fi
    done
    
    if [ $env != 'p' ]
    then
        echo ${tmp[@]} NP >> env.pop.append.yes 
    fi
    
    env=0
    
done < popAppend.yes
  
Script for family append: famAppend.sh

Also modified for family


declare -A FAM

FAM=( [oo1]=208 [oo2]=211 [oo3]=208 [oo4]=212 [oo5]=207 [oo6]=209 [oo7]=207 [oo8]=207 [o13]=210 [o14]=210 [o15]=210 [o16]=210 [o17]=214 [o18]=204 [o20]=204 [o21]=205 [o22]=214 [o23]=211 [o24]=214 [o25]=219 [o27]=206 [o28]=218 [o29]=217 [o30]=219 [o31]=217 [o32]=219 [o33]=201 [o34]=215 [o35]=201 [o36]=215 [o41]=216 [o42]=203 [o43]=216 [o44]=203 [o45]=216 [o46]=215 [o47]=201 [o48]=204 [o53]=202 [o58]=205 )


while read line                  # read AG.pop.behO1.master.res, line by line
do
    for x in $line
    do
        tmp[i]=$x
        ((i++))        
    done
    
    i=0
      
    for a in ${!FAM[@]}         # 
    do

        if [ $a == "${tmp[3]}" ]            # if sample ID (Mxx) is equivalent to POParray sampleID 
        then

            echo ${tmp[@]} ${FAM[$a]} >> finalBehO1.AtoIfull.yes # then store full line (fields 0-8) and append "AH"

        fi
    done    
done < env.pop.append.yes
yes.clean.append = read.csv(file = "finalBehO1.AtoIfull.yes", header = F, sep = " ")

colnames(yes.clean.append) <- c("Chrom", "Site", "Depth_mw", "Id", "Type", "Sr", "Ad", "Dp", "Pop", "Env", "Family")

yes.clean.append %>%
  filter(Depth_mw <= filter_depth_top) %>%
  filter(Depth_mw >= filter_depth_bottom) -> yes.filt

n_points = nrow(yes.filt)
# histogram of DP overlayed Depth_mw
ggplot(data = yes.filt) +
  geom_histogram(aes(Depth_mw, colour = "Depth_mw"), binwidth = 5, alpha = 0.2, fill = "red") +
  geom_histogram(aes(Dp, colour = "Dp"), fill = "blue", binwidth = 5, alpha = 0.2) +
  xlim(0,200) +
  ylim(0,20000) +
  xlab("Coverage Depth") +
  ylab("Number of Sites") +
  labs(title = "Dp and Depth_mw", caption = paste("# data points:", n_points)) +
  theme_minimal()

#plot(yes.filt$Dp, yes.filt$Depth_mw)
# scatter plot of Depth_mw ~ Dp

ggplot(data = yes.filt) +
  geom_point(aes(x = Depth_mw, y = Dp), shape=1, size=1) +
  theme_minimal() +
  ylim(0,350) +
  xlim(0,650) +
  xlab("coverage depth counted - mw") + 
  ylab("Dp - SPRINT")

# filter and sum for AHNP, Sr
yes.filt %>%
  filter(Pop == 'AH', Env == 'P') -> tmp
  
sum(tmp$Sr) -> sr_ah_p

# filter and sum for AHNP, Sr
yes.filt %>%
  filter(Pop == 'NJ', Env == 'P') -> tmp

sum(tmp$Sr) -> sr_nj_p

# filter and sum for AHP, Sr
yes.filt %>%
  filter(Pop == 'AH', Env == 'NP') -> tmp

sum(tmp$Sr) -> sr_ah_np

# filter and sum for NJNP, Sr
yes.filt %>%
  filter(Pop == 'NJ', Env == 'NP') -> tmp
  
sum(tmp$Sr) -> sr_nj_np

sr_table <- matrix(c(sr_ah_p, sr_ah_np, sr_nj_p, sr_nj_np), ncol = 2, byrow = F)

colnames(sr_table) <- c("AH", "NJ")
rownames(sr_table) <- c("P", "NP")

sr_table <- as.table(sr_table)

sr.df <- as.data.frame(sr_table)

colnames(sr.df) <- c("Env", "Pop", "Sr")

sr.df %>%
  select(Pop, Env, Sr) -> sr.df

sr_table
##       AH    NJ
## P  28151 30721
## NP 32256 31682
prop.table(sr_table)
##           AH        NJ
## P  0.2292240 0.2501506
## NP 0.2626496 0.2579757
summary(sr_table)
## Number of cases in table: 122810 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 84.93, df = 1, p-value = 3.098e-20
print("all edit sites")
## [1] "all edit sites"
# includes sr=0
interaction.plot(x.factor     = factor(yes.filt$Env, levels = c("P", "NP")),
                 trace.factor = yes.filt$Pop,
                 response     = yes.filt$Sr,
                 fun = mean,
                 type="b",
                 col=c("black","red"),  ### Colors for levels of trace var.
                 pch=c(19, 17),             ### Symbols for levels of trace var.
                 xlab = "predation environment",
                 ylab = "Average supporting reads per site",
                 fixed=TRUE,                    ### Order by factor order in data
                 leg.bty = "o")

#filter for sr>1
yes.filt %>%
  filter(Sr > 0) -> sr1_filt


print("Sr > 0")
## [1] "Sr > 0"
interaction.plot(x.factor     = factor(sr1_filt$Env, levels = c("P", "NP")),
                 trace.factor = sr1_filt$Pop,
                 response     = sr1_filt$Sr,
                 fun = mean,
                 type="b",
                 col=c("black","red"),  ### Colors for levels of trace var.
                 pch=c(19, 17),             ### Symbols for levels of trace 
                 fixed=T, ### Order by factor order in data
                 xlab = "predation environment",
                 ylab = "Average supporting reads per site",
                 leg.bty = "o")

sr_lm <- lm(Sr ~ Pop + Env + Pop:Env, data = sr.df)

anova(sr_lm)

sr.aov <- aov(Sr ~ Pop*Env, data = sr.df)
sr.aov2 <- aov(Sr ~ Pop + Env, data = sr.df)

summary(sr.aov)
summary(sr.aov2)

TukeyHSD(sr.aov)

NB2 <- glm.nb(Sr ~ Pop + Env + Pop*Env + offset(log(Dp)), 
              data = yes.filt,
              control=glm.control(maxit=100))