rm(list = ls())
knitr::opts_chunk$set(echo = TRUE)
library(MASS)
library(countreg)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
library(dplyr)
library(glmmTMB)
library(car)
## Loading required package: carData
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
rm all.depths Xid.all.depths
cat ../depths_allSamples/*.depths >> all.depths
# awk line to append "X" to id number
awk '{print $1, $2, $3, "X"$4}' all.depths >> Xid.all.depths
head Xid.all.depths
rm sort.clean2.master.res
cat ../sort.clean.Beh01_IDs_AG_TC_master.res | awk '{print $1,$2,$3,$4,$5,$6,$7,"X"$8}' | tr ":" " " | tr ' ' ',' | tr '\t' ',' >> sort.clean2.master.res
head sort.clean2.master.res
# WARNING: requires 39.9hrs run time!
#./supReads_append.sh
awk '{if (NF <= 4) print $0, "NA", 0, 0, 0; else print $0}' yes.final > zeros.yes.final
df.all <- read.csv(file = "Xid.all.depths", header = FALSE)
df.master <- read.csv(file = "sort.clean2.master.res", header = F)
# drop unneeded columns
df.master[,c("V1","V3","V4","V5","V7","V8","V9")] -> df.master
# replaces NA with zero
df.all[is.na(df.all)] <- 0
# column names
colnames(df.all) <- c("Chrom", "Site", "DP", "ID")
colnames(df.master) <- c("Chrom", "Site", "Type", "SupReads", "AD", "DP", "ID")
nrow(df.master)
df.master %>%
filter(Type == "AG" | Type == "TC" & Site != "147947") %>%
filter(ID != "XTC") -> df.master.clean
nrow(df.master.clean)
#accidentally used Pos1 instead of Pos2, just add 1 to correct this
df.all$Site = df.all$Site + 1
write.csv(df.master.clean, file = "df.master.csv", quote = FALSE, row.names = FALSE)
write.csv(df.all, file = "df.all.csv", quote = FALSE, row.names = FALSE)
#View(full_join(df.all, df.master, by = c("ChromSite", "ID")))
yes = read.csv(file = "zeros.yes.final", header = F, sep = " ")
colnames(yes) <- c("Chrom","Site","Depth_mw","Fish_id","Edit_type","SR","AD","DP")
nrow(yes)
yes %>%
filter(Chrom != "AG") %>%
filter(Chrom != "TC" ) %>%
filter(Chrom != "X") -> tmp.yes.forAwk
nrow(tmp.yes.forAwk)
View(tmp.yes.forAwk)
write.csv(tmp.yes.forAwk, file = "tmp.yes.forAwk", quote = F, row.names = FALSE)
head -5 tmp.yes.forAwk
awk -F "," '$5 != "X" {print}' tmp.yes.forAwk > yes.X.remove
wc -l yes.X.remove
head yes.X.remove
Xid = read.csv(file = "Xid.all.depths", header = FALSE)
sortClean2 = read.csv(file = "sort.clean2.master.res", header = FALSE)
#yes.clean = read.csv(file = "yes.X.remove")
#yes.clean %>%
# nrow()
filter_depth_bottom = 10
filter_depth_top = 50000
#modify Fish_id
rm yes.modID
sed 's/X00/oo/g' yes.X.remove | sed 's/X0/o/g' | tr ',' ' ' >> yes.modID
POParray=(oo2 oo4 oo6 oo8 o13 o15 o18 o20 o22 o24 o25 o27 o29 o31 o34 o36 o41 o43 o45 o47)
pop=0
while read line
do
for x in $line
do
tmp[i]=$x
((i++))
done
i=0
for y in "${POParray[@]}"
do
if [ $y == ${tmp[3]} ] # if sample ID (Mxx) is equivalent to POParray sampleID
then
echo ${tmp[@]} AH >> popAppend.yes # then store full line (fields 0-8) and append "AH"
pop=AH
fi
done
if [ $pop != 'AH' ]
then
echo ${tmp[@]} NJ >> popAppend.yes
fi
pop=0
done < yes.modID
PopAppend was modified for environment.
rm env.pop.append.yes
ENVarray=(oo3 oo4 oo5 oo6 o13 o14 o20 o21 o22 o27 o28 o29 o30 o35 o36 o43 o44 o45 o46 o53)
env=0
while read line
do
for x in $line
do
tmp[i]=$x
((i++))
done
i=0
for y in "${ENVarray[@]}"
do
if [ "$y" == "${tmp[3]}" ] # if sample ID (Mxx) is equivalent to POParray sampleID
then
echo ${tmp[@]} P >> env.pop.append.yes # then store full line (fields 0-8) and append "AH"
env=p
fi
done
if [ $env != 'p' ]
then
echo ${tmp[@]} NP >> env.pop.append.yes
fi
env=0
done < popAppend.yes
Also modified for family
declare -A FAM
FAM=( [oo1]=208 [oo2]=211 [oo3]=208 [oo4]=212 [oo5]=207 [oo6]=209 [oo7]=207 [oo8]=207 [o13]=210 [o14]=210 [o15]=210 [o16]=210 [o17]=214 [o18]=204 [o20]=204 [o21]=205 [o22]=214 [o23]=211 [o24]=214 [o25]=219 [o27]=206 [o28]=218 [o29]=217 [o30]=219 [o31]=217 [o32]=219 [o33]=201 [o34]=215 [o35]=201 [o36]=215 [o41]=216 [o42]=203 [o43]=216 [o44]=203 [o45]=216 [o46]=215 [o47]=201 [o48]=204 [o53]=202 [o58]=205 )
while read line # read AG.pop.behO1.master.res, line by line
do
for x in $line
do
tmp[i]=$x
((i++))
done
i=0
for a in ${!FAM[@]} #
do
if [ $a == "${tmp[3]}" ] # if sample ID (Mxx) is equivalent to POParray sampleID
then
echo ${tmp[@]} ${FAM[$a]} >> finalBehO1.AtoIfull.yes # then store full line (fields 0-8) and append "AH"
fi
done
done < env.pop.append.yes
yes.clean.append = read.csv(file = "finalBehO1.AtoIfull.yes", header = F, sep = " ")
colnames(yes.clean.append) <- c("Chrom", "Site", "Depth_mw", "Id", "Type", "Sr", "Ad", "Dp", "Pop", "Env", "Family")
yes.clean.append %>%
filter(Depth_mw <= filter_depth_top) %>%
filter(Depth_mw >= filter_depth_bottom) -> yes.filt
n_points = nrow(yes.filt)
# histogram of DP overlayed Depth_mw
ggplot(data = yes.filt) +
geom_histogram(aes(Depth_mw, colour = "Depth_mw"), binwidth = 5, alpha = 0.2, fill = "red") +
geom_histogram(aes(Dp, colour = "Dp"), fill = "blue", binwidth = 5, alpha = 0.2) +
xlim(0,200) +
ylim(0,20000) +
xlab("Coverage Depth") +
ylab("Number of Sites") +
labs(title = "Dp and Depth_mw", caption = paste("# data points:", n_points)) +
theme_minimal()
#plot(yes.filt$Dp, yes.filt$Depth_mw)
# scatter plot of Depth_mw ~ Dp
ggplot(data = yes.filt) +
geom_point(aes(x = Depth_mw, y = Dp), shape=1, size=1) +
theme_minimal() +
ylim(0,350) +
xlim(0,650) +
xlab("coverage depth counted - mw") +
ylab("Dp - SPRINT")
# filter and sum for AHNP, Sr
yes.filt %>%
filter(Pop == 'AH', Env == 'P') -> tmp
sum(tmp$Sr) -> sr_ah_p
# filter and sum for AHNP, Sr
yes.filt %>%
filter(Pop == 'NJ', Env == 'P') -> tmp
sum(tmp$Sr) -> sr_nj_p
# filter and sum for AHP, Sr
yes.filt %>%
filter(Pop == 'AH', Env == 'NP') -> tmp
sum(tmp$Sr) -> sr_ah_np
# filter and sum for NJNP, Sr
yes.filt %>%
filter(Pop == 'NJ', Env == 'NP') -> tmp
sum(tmp$Sr) -> sr_nj_np
sr_table <- matrix(c(sr_ah_p, sr_ah_np, sr_nj_p, sr_nj_np), ncol = 2, byrow = F)
colnames(sr_table) <- c("AH", "NJ")
rownames(sr_table) <- c("P", "NP")
sr_table <- as.table(sr_table)
sr.df <- as.data.frame(sr_table)
colnames(sr.df) <- c("Env", "Pop", "Sr")
sr.df %>%
select(Pop, Env, Sr) -> sr.df
sr_table
## AH NJ
## P 28151 30721
## NP 32256 31682
prop.table(sr_table)
## AH NJ
## P 0.2292240 0.2501506
## NP 0.2626496 0.2579757
summary(sr_table)
## Number of cases in table: 122810
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 84.93, df = 1, p-value = 3.098e-20
print("all edit sites")
## [1] "all edit sites"
# includes sr=0
interaction.plot(x.factor = factor(yes.filt$Env, levels = c("P", "NP")),
trace.factor = yes.filt$Pop,
response = yes.filt$Sr,
fun = mean,
type="b",
col=c("black","red"), ### Colors for levels of trace var.
pch=c(19, 17), ### Symbols for levels of trace var.
xlab = "predation environment",
ylab = "Average supporting reads per site",
fixed=TRUE, ### Order by factor order in data
leg.bty = "o")
#filter for sr>1
yes.filt %>%
filter(Sr > 0) -> sr1_filt
print("Sr > 0")
## [1] "Sr > 0"
interaction.plot(x.factor = factor(sr1_filt$Env, levels = c("P", "NP")),
trace.factor = sr1_filt$Pop,
response = sr1_filt$Sr,
fun = mean,
type="b",
col=c("black","red"), ### Colors for levels of trace var.
pch=c(19, 17), ### Symbols for levels of trace
fixed=T, ### Order by factor order in data
xlab = "predation environment",
ylab = "Average supporting reads per site",
leg.bty = "o")
sr_lm <- lm(Sr ~ Pop + Env + Pop:Env, data = sr.df)
anova(sr_lm)
sr.aov <- aov(Sr ~ Pop*Env, data = sr.df)
sr.aov2 <- aov(Sr ~ Pop + Env, data = sr.df)
summary(sr.aov)
summary(sr.aov2)
TukeyHSD(sr.aov)
NB2 <- glm.nb(Sr ~ Pop + Env + Pop*Env + offset(log(Dp)),
data = yes.filt,
control=glm.control(maxit=100))