require(poppr)
require(genepop)
require(graph4lg)
require(related)
require(adegenet)
require(knitr)
require(tidyverse)
require(magrittr)

Readme

This is document is an R notebook. If you’d like view to pre-rendered figures, read a summary of analysis and interact with code, please open the relevant html file in a browser.

To conduct a similar analyses on your computer, edit or run code: clone this repository into a directory on your local machine and open the .Rproj file in Rstudio.

Rationale

This is the genotyping log for the 2021 Coquille River O. mykiss project

Data Summary

Sequence Data

Summary
- SFGL Run Number 022 - Gtseq Data using the SFGL Omy392/Omy391 panel:
- Raw data from UofO sequencing center is located at “/dfs/Omalley_Lab/fitz/Runs/4819”
- Arrived demultiplexed, lane also contains Ots data - 91 samples fastqs + 5 controls

  • Controls and Omy data copied to dayan directory for project: “/dfs/Omalley_Lab/dayan/small_omy_projects/coquille_2021/raw_reads”

fastqc

Fastqc report for merged demulitplexed from this project available in repo under genotype_data directory

Everything looked pretty typical for a GTseq run, perhaps a little high on adapter Raw reads: 37178964 Q: >34 until base 60 Adapter Sequence: Zero until base 20, then gradual increase up to 40% by base 40.

Sample Metadata

sheet1 <- readxl::read_xlsx("metadata/GT-seq_GC3F-CKF-005_metadata.xlsx", sheet = 1)
sheet2 <- readxl::read_xlsx("metadata/GT-seq_GC3F-CKF-005_metadata.xlsx", sheet = 2)

meta_data <- sheet1 %>%
  bind_rows(sheet2) %>%
  select(-1) %>%
  mutate(pop = str_sub(`SFGL Id`, 8, 11))

kable(meta_data %>%
  group_by(pop, `Hat/Wild`) %>%
  summarise(n = n()) )
pop Hat/Wild n
NFCQ Hatchery 15
NFCQ Wild 26
SFCQ Hatchery 22
SFCQ Wild 28
kable(meta_data %>%
  group_by(pop, Date) %>%
  summarise(n = n()) )
pop Date n
NFCQ 2016-03-30 15
NFCQ 2016-07-11 10
NFCQ 2016-07-13 16
SFCQ 2016-02-17 7
SFCQ 2016-03-01 10
SFCQ 2016-03-30 12
SFCQ 2016-07-12 21
rm(sheet1)
rm(sheet2)

41 North Fork Coquille River Winter Steelhead (including 15 hatchery and 26 wild) and 50 South Fork Coquille River Winter Steelhead (including 22 hatchery and 28 wild)

Genotyping

Genotype

Main Genotyper

# SERVER

# Decompression script

# note 1: this is a script to save and submit as a job, save everything below the long ########### below

#note 2: the number of threads to use (-t option) is hardcoded to match the number of input files, change this number to reflect how many fastq.gz files you have

################################# save everything below this as a file
#!/bin/bash
#$ -S /bin/bash
#$ -t 1-96
#$ -tc 60
#$ -N decompress
#$ -cwd
#$ -o $JOB_NAME_$TASK_ID.out
#$ -e $JOB_NAME_$TASK_ID.err

FASTQS=(`ls *fastq.gz`)
INFILE=${FASTQS[$SGE_TASK_ID -1]}

gunzip -c $INFILE > ${INFILE%.gz}

#save as script and submit this with qsub -q harold scriptname
####################################
# SERVER 

# Genotyper Script

# note 1: this is a script to save and submit as a job, save everything below the long ########### below

#note 2: the number of threads to use (-t option) is hardcoded to match the number of input files, change this number to reflect how many fastq files you have in the directory

################################# save everything below this as a file
#!/bin/bash
#$ -S /bin/bash

#$ -t 1-96

#$ -tc 96

#$ -N GTseq-genotyperv3

#$ -cwd

#$ -o $JOB_NAME_$TASK_ID.out

#$ -e $JOB_NAME_$TASK_ID.err
export PERL5LIB='/home/fw/dayand/perl5/lib/perl5/x86_64-linux-thread-multi/' #you may want to change this to your own perl lib destination 

FASTQS=(`ls ./*fastq`) #reminder to change the directory to your copy of the fastqs
INFILE=${FASTQS[$SGE_TASK_ID -1]}
OUTFILE=$(basename ${INFILE%.fastq}.genos)

GTSEQ_GENO="/dfs/Omalley_Lab/dayan/software/GTseq-Pipeline/GTseq_Genotyper_v3.1.pl" #again, change this path to your own copy of this script

PROBE_SEQS="/dfs/Omalley_Lab/dayan/software/GTseq-Pipeline/Omy_390_probe_seqs.csv" #change the probe seq file to match whatever panel you are using

perl $GTSEQ_GENO $PROBE_SEQS $INFILE > $OUTFILE

#save this code chunk as a file on the server and submit this with the following command from the directory you want the output .genos files:
# qsub -q harold scriptname 
# note that you might submit to a different -q than harold

Sex Genotyper

After the genotypes are written for the panel, we add the sex genotyper.

SGE_Batch -q harold -r omysex -c 'perl /dfs/Omalley_Lab/dayan/software/GTseq-Pipeline/OmySEX_test_v3.pl'

Compile Genotypes

After all the .genos are written, we compile them into a single output using the GenoCompile script


SGE_Batch -q harold -r compile -c 'perl /dfs/Omalley_Lab/dayan/software/GTseq-Pipeline/GTseq_GenoCompile_v3.pl > coquille_GTs_0.1.csv' 

QAQC

Marker Info

The first step in the QA-QC process is to collect some information about genotying success from the .genos files. We’ll do this with an awk one liner.

The script below will pull the allele count ratios and read counts for all individuals in the pipeline

# SERVER

#run from directory with your .genos (use a SGE_Batch job or interactive shell)

#collect marker info from all the genos files
touch marker_info.csv
echo 'ind,marker,a1_count,a2_count,called_geno,a1_corr,a2_corr' >> marker_info.csv
for file in ./*genos
do
    awk -F"," ' BEGIN { OFS="," } {print FILENAME,$1,$2,$3,$6,$7,$8}' $file >> marker_info.csv
done

# now we'll cleanup this file so that it is easier to work with
sed -i '/Raw-Reads/d' ./marker_info.csv #first get rid of genos headers
sed -i '/negative/d' ./marker_info.csv # then get rid of controls
sed -i '/positive/d' ./marker_info.csv # then get rid of controls
sed -i '/Summer/d' ./marker_info.csv
sed -i '/Het/d' ./marker_info.csv
sed -i '/Winter/d' ./marker_info.csv

Read in the marker info file and clean it up a little more. Note you’ll have to transfer the file off the server for this.

#LOCAL R

marker_info <- read_csv("genotype_data/marker_info.csv")

#this part changes the values of A=2, G=898, -=52, etc for the allele count columns to the actual values, and gets rid of some mess in the sample names (ind)
marker_info %<>%
  mutate(a1_count =  as.numeric(substr(a1_count, 3, nchar(a1_count)))) %>%
  mutate(a2_count =  as.numeric(substr(a2_count, 3, nchar(a2_count)))) %>%
  mutate(ind = str_remove(ind, "^\\./")) %>%
  mutate(ind = str_remove(ind, "\\.genos")) %>%
  filter(!(is.na(marker)))
fivefour <- filter(marker_info, marker =="Omy_RAD47080-54")
five3 <- filter(marker_info, marker =="Omy_RAD15709-53")

fivefour %<>% 
  mutate(a1_count = case_when(a1_count == 0 ~ 1,
                              TRUE ~ a1_count)) %>%
  mutate(a2_count = case_when(a2_count == 0 ~ 1,
                               TRUE ~ a2_count)) %>%
  mutate(allele_ratio_54 = a1_count/a2_count) %>%
  rename(called_geno_54 = called_geno)


five3 %<>% 
  mutate(a1_count = case_when(a1_count == 0 ~ 1,
                               TRUE ~ a1_count)) %>%
  mutate(a2_count = case_when(a2_count == 0 ~ 1,
                               TRUE ~ a2_count)) %>%
  mutate(allele_ratio_53 = a2_count/a1_count) %>%#different strands so flip the ratios around
  rename(called_geno_53 = called_geno)
  
comp <- select(fivefour, ind, allele_ratio_54, called_geno_54) %>%
  left_join(select(five3, ind, allele_ratio_53, called_geno_53)) %>% #switch the called geno around for one of these because the probe seq is different strand 
  mutate(called_geno_54 = case_when(called_geno_54 == "A1HOM" ~ "A2HOM",
                                    called_geno_54 == "A2HOM" ~ "A1HOM",
                                    TRUE ~ called_geno_54)) %>%
  mutate(agree = called_geno_54 == called_geno_53)

ggplot(data = comp)+geom_point(aes(log(allele_ratio_54), log(allele_ratio_53), color = agree))+theme_classic()+geom_abline(aes(slope =1, intercept = 0))

Controls

First let’s check that the controls worked well. We will check that negative controls have much fewer reads than average (there may be some on-target reads from othr samples due to index sequence error)

# LOCAL R

# First we are going to prep the raw genotype data for filtering.

# read the raw genotypes file in to R
genos_0.1 <- read_csv("genotype_data/coquille_GTs_0.1.csv")

# add a field to mark controls
# here controls contained "positive," "negative" in their sample names so used simple pattern matching to create a new column, you can add you own here for known controls (e.g. known winter run steelhead)
genos_0.1 %<>%
  mutate(control = case_when(str_detect(Sample, "positive") ~ "positive",
                             str_detect(Sample, "negative") ~ "negative",
                             str_detect(Sample, "Het") ~ "het",
                             TRUE ~ "sample"))

# clean up sample name field
# split the sample name and the adapter sequence (note that replicates will have the same sample name, but we'll keep track with the adapter sequences)

genos_0.1 %<>%
  mutate(adapter = str_extract(Sample, "[ATCG]{6}_[ATCG]{6}")) %>%
  mutate(sample_simple = str_extract(Sample, "[:upper:][:lower:]{2}[AJCU][RC]\\d{2}\\w{4}_\\d{4}")) %>%
  relocate(Sample, sample_simple, adapter)

# great, prep is done, now lets make our first plot: distribution of reads between controls and samples
ggplot()+geom_histogram(data = genos_0.1, aes(x = `On-Target Reads`, fill= control)) + theme_classic()+scale_fill_viridis_d()

A lot of failed samples in this run (~15), but controls look good.

#LOCAL R
ggplot()+geom_histogram(data = genos_0.1, aes(x = `%On-Target`, fill= control)) + theme_classic()+scale_fill_viridis_d()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Replicates

No replicates in this data.

#LOCAL R

# here we filter out our known controls and create our next dataset genos_0.11
genos_0.11 <- genos_0.1 %>%
  filter(control == "sample") %>%
  select(-control) #get rid of this column

# sometimes we'll use more than a single replicate, this broke a previous version of this code chunk, for now we'll find the triplicate (or more) samples, and keep the two with greatest on target read depth (ignoring triplicates quadruplicates etc)
 
genos_0.11 %<>% 
  group_by(sample_simple) %>%
  slice_max(order_by = `On-Target Reads`, n = 2)

#now let's get duplicated samples
#dups <- genos_0.11[genos_0.11$sample_simple %in% genos_0.11$sample_simple[duplicated(genos_0.11$sample_simple)],]
#dups <- dups[order(dups$sample_simple),]

# next we'll calculate the percent concordance among replicates
# woof I don't see a good way around using a nested for loop here, maybe fix this in the future

#dups_genos <- dups[,9:ncol(dups)] 
#rep_info <- matrix(ncol=ncol(dups_genos), nrow=nrow(dups_genos)/2)
#colnames(rep_info) <- colnames(dups_genos)
#for (j in 1:(nrow(dups_genos)/2)) {
#for (i in 1:ncol(dups_genos)) {
#  rep_info[j,i] <- sum(dups_genos[(j*2)-1,i]==dups_genos[(j*2),i])
#}
#  }

#geno_concordance <- as.data.frame(as.matrix(rep_info)) %>%
#  rowMeans()

#rep_data <- as.data.frame(cbind(dups[c(1:length(geno_concordance))*2,1], geno_concordance))
#ggplot(data=rep_data)+geom_histogram(aes(x=geno_concordance))+theme_classic()

Replication Summary
Again this part of the SOP isnt doing anything because there are no replicates … Next let’s make the 0.2 dataset (i.e. remove the replicates with lower GT success).

# LOCAL R

#this writes a new dataset (0.2) by choosing the samples within duplicates and keeping the one with the highest genotyping success
genos_0.2 <- genos_0.11 %>%
  group_by(sample_simple) %>%
  filter(`On-Target Reads` == max(`On-Target Reads`))

Sex Ratios

If you are using the OmySEX and OtsSEX scripts to call sex genotypes, read this section. If else, skip it, and move on to filtering.

The OmySEX and OtsSEX scripts rely on hardcoded estimates of the proportion of reads dedicated to the sex marker to call sex genotypes. This can sometimes go awry if the sex marker does not amplify as expected during the library prep (e.g. primers are aging, proportion of primers in the primer pool is off, etc).

In this portion of the SOP we will check if the sex genotyping script worked well, and if not, apply a correction.

Sex Script Check

First, let’s check if the script worked correctly.

#LOCAL R

# Plot sex marker counts

sex_marker_info <- marker_info %>%
  filter(str_detect(marker, "SEXY")) %>%
  mutate(called_geno = replace_na(called_geno, "00")) %>%
  mutate(called_geno = case_when(called_geno == "A1HOM" ~ "XX",
                                 called_geno == "HET" ~ "XY",
                                 called_geno == "00" ~ "00"))
  
ggplot(data = sex_marker_info)+geom_point(aes(a2_count, a1_count, color = called_geno))+scale_colour_viridis_d(name = "Called Sex Genotype")+theme_classic()+xlab("Y-specific probe count")+ylab("Theoretical X chromosome count")+geom_abline(aes(intercept = 0, slope = 0.1))+geom_abline(aes(intercept = 0, slope = 5))+geom_abline(aes(intercept = 0, slope = 10))+geom_abline(aes(intercept = 0, slope = 0.2))+xlim(0,max(c(sex_marker_info$a1_count, sex_marker_info$a2_count)))+ylim(0,max(c(sex_marker_info$a1_count, sex_marker_info$a2_count)))+geom_abline(aes(intercept = 0, slope = 1), color = "darkred")

kable(sex_marker_info %>% count(called_geno), caption = "Called Sex Ratio")
Called Sex Ratio
called_geno n
00 15
XX 38
XY 38

Yes needs correction.

Sex Genotype Correction

Here, instead of relying on a harcoded value for the proportion of reads dedicated to the sex marker, we will attempt to estimate this value from the empirical data, and use this estimate to re-call sex genotypes.

#LOCAL R

# First let's collect the number of on-target reads for each sample in the sex_marker_info dataframe
sex_marker_info$sample <- str_extract(sex_marker_info$ind, "[:upper:][:lower:]{2}[AJCU][RC]\\d{2}\\w{4}_\\d{4}")
sex_marker_info$adapter <- str_extract(sex_marker_info$ind, "[ATCG]{6}_[ATCG]{6}") 

sex_marker_info %<>%
  left_join(dplyr::select(genos_0.1, sample_simple, adapter, `On-Target Reads`), by = c("sample" = "sample_simple" , "adapter" = "adapter"))

# now here's the tricky part. we're going to do a first pass filter to eliminate any samples that coldn't possibly be males, then estimate the theoretical number of X chromosome counts for possible males
#first get the estimate
Xmale_proportion_error <- sex_marker_info %>%
  filter(a2_count > 1) %>% #this value is set to 1 if the actual count is zero in the omysexscript, so here we're are eliminating all individuals where it woulbe extremely unlikely that they are males 
  summarise(Xmale_proportion = mean(a2_count/`On-Target Reads`))

#then use the estimate to correct the data
sex_marker_info %<>%
  mutate(XY_count_estimate = Xmale_proportion_error[[1]]*`On-Target Reads`) %>%
  mutate(corrected_ratio = XY_count_estimate/a2_count) %>%
  mutate(corrected_sex_genotype = case_when(corrected_ratio > 10 ~ "XX",
                                            corrected_ratio <= 0.1 ~ "XY",
                                            corrected_ratio <= 0.2 ~ "00",
                                            corrected_ratio <= 5 ~ "XY",
                                            TRUE ~ "00"
                                            )) #note these ratios are taken directly from the omysex script but this seems like another bug, shouldn't XY < 0.2 be NA not XY???

Now we’ll plot again and see if this correction worked as expected.

#LOCAL R

ggplot(data = sex_marker_info)+geom_point(aes(a2_count, XY_count_estimate, color = corrected_sex_genotype), alpha = 0.7)+scale_colour_viridis_d(name = "Corrected Called\nSex Genotype")+theme_classic()+xlab("Y-specific probe count")+ylab("Corrected Theoretical X chromosome count")+geom_abline(aes(intercept = 0, slope = 0.1))+geom_abline(aes(intercept = 0, slope = 5))+geom_abline(aes(intercept = 0, slope = 10))+geom_abline(aes(intercept = 0, slope = 0.2))+xlim(0,max(c(sex_marker_info$XY_count_estimate, sex_marker_info$a2_count)))+ylim(0,max(c(sex_marker_info$XY_count_estimate, sex_marker_info$a2_count)))+geom_abline(aes(intercept = 0, slope = 1), color = "darkred")

kable(sex_marker_info %>% 
        group_by(corrected_sex_genotype) %>%
        summarise(count = n(), mean_depth = mean(`On-Target Reads`)), caption = "Corrected Called Sex Ratio")
Corrected Called Sex Ratio
corrected_sex_genotype count mean_depth
00 5 11194.40
XX 36 137968.31
XY 50 77497.84

This correction looks like it worked: Called males follow a 1:1 line and there are very few uncalled sex genotypes, except for samples with very low total on-target reads. Also the overall sex ratio of this sample is closer to our a priori expectations.

Finally let’s correct the sex genotypes in the data.

#LOCAL R

# run this command for Omy
genos_0.2 %<>%
  left_join(select(sex_marker_info, sample, adapter, corrected_sex_genotype), by = c("sample_simple"="sample", "adapter"="adapter")) %>%
  mutate(OmyY1_2SEXY = corrected_sex_genotype) %>%
  select(-corrected_sex_genotype)

Filtering

Control and replicates have been removed, now it’s time for filtering.

Filtering Summary
We take an iterative approach to filtering:

First remove worst individuals and genotypes: - GTperc_cutoff=30 (indivudals greater than 30% missing data excluded) - Missingness (loci) > 50% (loci with total missing data > 50% removed) - IFI_cutoff = 10 (i.e. >10% background reads)

Then recalculate missingness and IFI - IFI_cutoff=2.5
- GTperc_cutoff=90 (inds greater than 10% missing data excluded)
- Missingness (loci) > 20%

Then examine for paralogues among markers with
- Missingness (loci) > 10% - examine for allele correction issues
- Markers where heterozygotes and “in-betweeners” do not follow 1:1 ratio of allele counts - Markers with high variance in ratio of allele counts at heteroyzgotes and “in-betweeners” - Remove monomorphic SNPs

IFI and Missingness

First we filter individuals and loci on IFI, and missingness.

Let’s take a look at the distribution of these values before any filtering

#LOCAL R

ggplot(genos_0.2)+geom_histogram(aes(x=IFI))+geom_vline(aes(xintercept= 2.5), color="red")+theme_classic()

ggplot(genos_0.2)+geom_histogram(aes(x=`%GT`))+geom_vline(aes(xintercept= 90), color="red")+theme_classic()

missingness <- (colSums(genos_0.2[,c(9:ncol(genos_0.2))] == "00" | genos_0.2[,c(8:(ncol(genos_0.2)-1))] == "0"))/nrow(genos_0.2) #warning hardcoding: "[,8:398]" is hardcoded to work on the example script using the Omy panel with 390 markers, these values will need to be changed to reflect the genotype columns of the genos r object that YOU are running. This excludes columns with metadata and genotyping results such as "sample name" "ifi" "on-target reads" etc
missing <- as.data.frame(missingness)
missing$marker <- row.names(missing)
ggplot(missing) + geom_histogram(aes(x=missingness))+geom_vline(aes(xintercept= 0.2), color="red")+geom_vline(aes(xintercept= 0.1), color="blue")+theme_classic()+xlab("missingness (loci)")

0.3: Extremely Bad Loci and Individuals Excluded

First remove the individuals and markers that clearly failed to genotype correctly (one step at a time)

#print table of bad missingness individual
kable(genos_0.2 %>%
  filter(`%GT` < 70) %>%
    select(2,4,5,6,7), caption = paste(nrow(genos_0.2 %>% filter(`%GT` < 70)), "Individuals with high missingess (>30% missing data)"))
15 Individuals with high missingess (>30% missing data)
sample_simple Raw Reads On-Target Reads %On-Target %GT
OmyAC16SFCQ_0009 303860 265 0.09 0.00
OmyAC16SFCQ_0016 388263 2202 0.57 13.59
OmyAC16SFCQ_0017 219946 209 0.10 0.00
OmyJC16NFCQ_0008 359953 518 0.14 0.00
OmyJC16NFCQ_0011 316462 279 0.09 0.00
OmyJC16NFCQ_0018 364362 2079 0.57 18.46
OmyJC16NFCQ_0033 255828 1567 0.61 9.49
OmyJC16NFCQ_0037 252004 241 0.10 0.00
OmyJC16SFCQ_0026 199637 254 0.13 0.00
OmyJC16SFCQ_0027 259771 424 0.16 0.51
OmyJC16SFCQ_0034 202708 483 0.24 0.51
OmyJC16SFCQ_0038 176997 4206 2.38 39.23
OmyJC16SFCQ_0041 318705 9208 2.89 68.97
OmyJC16SFCQ_0043 240443 2732 1.14 22.31
OmyJC16SFCQ_0045 200057 1044 0.52 3.85
# now remove them
genos_0.3 <- genos_0.2 %>%
  filter(`%GT` > 70)

#now recalculate locus level missingness after removing the worst individuals
  
missingness2 <- (colSums(genos_0.3[,c(9:(ncol(genos_0.3)))] == "00" | genos_0.3[,c(9:(ncol(genos_0.3)))] == "0"))/nrow(genos_0.3) 
missing2 <- as.data.frame(missingness2)
missing2$marker <- row.names(missing2)

#then remove these markers
# collect bad markers
very_bad_markers <- missing2[missing2$missingness2>0.5, 2]
print(paste(length(very_bad_markers), "markers with > 50% missing data"))
## [1] "2 markers with > 50% missing data"
#write the new dataset
genos_0.3 <- genos_0.3 %>%
  dplyr::select(-one_of(very_bad_markers))

#then recalculate IFI
# IFI is equal to the percentage of "background" reads to homozygote reads. Two types of reads contribute to background count: (1) Reads from the alternative allele when an individual has been called as homozygote at a locus, and (2) reads from the less frequent allele when the individual has been called as "in-betweener". We update the IFI score by including only markers in the filtered dataset


IFI <- marker_info %>%
  filter(marker %in% colnames(genos_0.3)) %>%
  group_by(ind) %>%
  summarize(back_count = sum(a1_count[called_geno == "A2HOM"], na.rm = TRUE)
            + sum(a2_count[called_geno == "A1HOM"], na.rm = TRUE)
            + sum(a1_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a2_count > a1_count)], na.rm = TRUE )
            + sum(a2_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a1_count > a2_count)], na.rm = TRUE ),
            
            hom_ct = sum(a1_count[called_geno == "A1HOM"], na.rm = TRUE)
            + sum(a2_count[called_geno == "A2HOM"], na.rm = TRUE)
            + sum(a2_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a2_count > a1_count)], na.rm = TRUE )
            + sum(a1_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a1_count > a2_count)], na.rm = TRUE ),
            
            ifi2 = (back_count/hom_ct)*100)

# the "marker_info" file we produced earlier used the filename of the genos file as the sample name (column name "ind"), but the sample names in our local R dataframes are very cleaned up (see line 504). Here I attempt to do the same using some regex in R using the standardized codes for sample naming at SFGL, but note that depending on how your fastq files are named, these exact matches may not work for you
# until we find a better solution I suggest two alternatives if this regex below breaks
# 1: if the number of high IFI samples is very low, just write the sample names out manually to a vector and use this to filter
# 2:

IFI$sample <- str_extract(IFI$ind, "[:upper:][:lower:]{2}[AJCU][RC]\\d{2}\\w{4}_\\d{4}")
IFI$adapter <- str_extract(IFI$ind, "[ATCG]{6}_[ATCG]{6}") 


genos_0.3 <- genos_0.3 %>%
  left_join(select(IFI, sample, adapter, ifi2), by = c("sample_simple" = "sample", "adapter" = "adapter")) %>%
  mutate(IFI = ifi2) %>%
  select(-one_of("ifi2"))

# now filter on IFI
#print table of bad IFI samples
kable(genos_0.3 %>%
  filter(IFI >10) %>%
    select(2:7), caption = "Extreme High IFI (>10) samples (low confidence barcodes)")
Extreme High IFI (>10) samples (low confidence barcodes)
sample_simple adapter Raw Reads On-Target Reads %On-Target %GT
#update the  dataset
genos_0.3 <- genos_0.3 %>%
  filter(IFI < 10)

Filtering log 0.2 -> 0.3:
15 high missingness inds 2 markers high missingness 0 IFI

0.4 Second Iteration Filter

Next we do the same process, but at the final filtering levels:

  • IFI_cutoff=2.5
  • GTperc_cutoff=90 (inds greater than 10% missing data excluded)
  • Missingness (loci) > 20%
#print table of bad missingness individual
kable(genos_0.3 %>%
  filter(`%GT` < 90) %>%
    select(2,4,5,6,7,8), caption = paste(nrow(genos_0.3 %>% filter(`%GT` < 90)), "Individuals with high missingess (>10% missing data)"))
3 Individuals with high missingess (>10% missing data)
sample_simple Raw Reads On-Target Reads %On-Target %GT IFI
OmyAC16SFCQ_0005 228966 24288 10.61 87.18 0.3561465
OmyAC16SFCQ_0006 227920 13695 6.01 74.36 2.8393758
OmyJC16SFCQ_0024 384926 45415 11.80 79.49 0.6486706
# now remove them
genos_0.4 <- genos_0.3 %>%
  filter(`%GT` > 90)

#now recalculate locus level missingness after removing the worst individuals
  
missingness3 <- (colSums(genos_0.4[,c(9:(ncol(genos_0.4)))] == "00" | genos_0.4[,c(9:(ncol(genos_0.4)))] == "0"))/nrow(genos_0.4) 
missing3 <- as.data.frame(missingness3)
missing3$marker <- row.names(missing3)

#then remove these markers
# collect bad markers
bad_markers <- missing3[missing3$missingness3>0.2, 2]
print(paste(length(bad_markers), "markers with > 20% missing data"))
## [1] "6 markers with > 20% missing data"
#write the new dataset
genos_0.4 <- genos_0.4 %>%
  dplyr::select(-one_of(bad_markers))

#then recalculate IFI
# IFI is equal to the percentage of "background" reads to homozygote reads. Two types of reads contribute to background count: (1) Reads from the alternative allele when an individual has been called as homozygote at a locus, and (2) reads from the less frequent allele when the individual has been called as "in-betweener"

IFI <- marker_info %>%
  filter(marker %in% colnames(genos_0.4)) %>%
  group_by(ind) %>%
  summarize(back_count = sum(a1_count[called_geno == "A2HOM"], na.rm = TRUE)
            + sum(a2_count[called_geno == "A1HOM"], na.rm = TRUE)
            + sum(a1_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a2_count > a1_count)], na.rm = TRUE )
            + sum(a2_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a1_count > a2_count)], na.rm = TRUE ),
            
            hom_ct = sum(a1_count[called_geno == "A1HOM"], na.rm = TRUE)
            + sum(a2_count[called_geno == "A2HOM"], na.rm = TRUE)
            + sum(a2_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a2_count > a1_count)], na.rm = TRUE )
            + sum(a1_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a1_count > a2_count)], na.rm = TRUE ),
            
            ifi2 = (back_count/hom_ct)*100)

# the "marker_info" file we produced earlier used the filename of the genos file as the sample name (column name "ind"), but the sample names in our local R dataframes are very cleaned up (see line 504). Here I attempt to do the same using some regex in R using the standardized codes for sample naming at SFGL, but note that depending on how your fastq files are named, these exact matches may not work for you
# until we find a better solution I suggest two alternatives if this regex below breaks
# 1: if the number of high IFI samples is very low, just write the sample names out manually to a vector and use this to filter
# 2: 

IFI$sample <- str_extract(IFI$ind, "[:upper:][:lower:]{2}[AJCU][RC]\\d{2}\\w{4}_\\d{4}")
IFI$adapter <- str_extract(IFI$ind, "[ATCG]{6}_[ATCG]{6}") 


genos_0.4 <- genos_0.4 %>%
  left_join(select(IFI, sample, adapter, ifi2), by = c("sample_simple" = "sample", "adapter" = "adapter")) %>%
  mutate(IFI = ifi2) %>%
  select(-one_of("ifi2"))

# now filter on IFI
#print table of bad IFI samples
kable(genos_0.4 %>%
  filter(IFI >2.5) %>%
    select(2:7), caption = "High IFI (>2.5) samples (low confidence barcodes)")
High IFI (>2.5) samples (low confidence barcodes)
sample_simple adapter Raw Reads On-Target Reads %On-Target %GT
OmyJC16NFCQ_0042 TGACCA_TAGTAT 238493 49634 20.81 91.03
OmyJC16NFCQ_0044 TGACCA_TGCTTA 595310 236707 39.76 91.03
#update the  dataset
genos_0.4 <- genos_0.4 %>%
  filter(IFI < 2.5)

0.3 -> 0.4 Filtering Log

Filtered out:
3 individuals with <90% genotying success (i.e. greater than 10% missing data)
6 markers with > 20% missingness
2 contaminated samples (note here that all the samples with high IFI are already removed by the individual level missingness step in the example data)

0.5: Removing Paralogs

Now we manually examine allele counts for markers that may tag paralogues regions. Because our panels can contain hundreds of loci, we flag three types of markers for close scrutiny (below), but this is informal and you can also look at any marker you want using some of the scripts below.
- Missingness (loci) > 10% - examine for allele correction issues
- Markers where heterozygotes and “in-betweeners” do not follow 1:1 ratio of allele counts - Markers with high variance in ratio of allele counts at heteroyzgotes and “in-betweeners”

Let’s collect these markers, first markers with high missingness (10-20% missingness)

# Local R

#get marker names of markers with 0.1 > missingness > 0.2
miss0.1 <- missing3[missing3$missingness3 > 0.1,]
miss_mod <- miss0.1[miss0.1$missingness3 < 0.2, 2]

Next, markers with skewed allele count ratios and allele ratios with high variance. We do this by fitting a linear model between allele 1 counts and allele 2 counts and then flagging markers with a ratio of > 1.5 (3/2) and less than 2/3. We also flag markers where the fit

library(lme4)
hets <- filter(marker_info, called_geno == "HET" | is.na(called_geno))

models <- hets %>%
  filter(marker %in% colnames(genos_0.4)) %>%
  filter(is.na(a1_count) == FALSE & is.na(a2_count) == FALSE) %>%
  group_by(marker) %>%
  group_map(~ lm(a1_count ~ a2_count, data= .))

# Apply coef to each model and return a list of allele count ratios
lms <- lapply(models, coef)
ggplot()+geom_histogram(aes(x = sapply(lms,`[`,2)))+theme_classic()+ggtitle("allele ratios for all NA and HET calls")+geom_vline(aes(xintercept = 1.5), color = "red", linetype = 2)+geom_vline(aes(xintercept = (2/3)), color = "red", linetype = 2)+xlab("allele ratio (a1/a2)")+geom_vline(aes(xintercept = 1), color = "black")

#list of p-values
lms_anova <- lapply(models, summary)


# collect info about each bad model
paralog_possible <- which(abs(sapply(lms,`[`,2)) > 1.5) #bad because a positively skewed allele ratio
paralog_possible2 <- which(abs(sapply(lms,`[`,2)) < (2/3)) # bad because a negative skewed allele ratio

paralog_possible3 <- which(sapply(lms_anova, function(x) x$coefficients[,4][2])> 0.01) # bad because too much variance in allele ratio, even if mean ratio is 1

paralog_possible <- c(paralog_possible, paralog_possible2, paralog_possible3)
# R Local

plots <- marker_info %>%
  filter(marker %in% colnames(genos_0.4)) %>%
  filter(is.na(a1_count) == FALSE & is.na(a2_count) == FALSE) %>%
  group_by(marker) %>%
  do(plots=ggplot(data=.)+geom_point(aes(a1_count, a2_count, color = called_geno))+theme_classic()+geom_abline(aes(slope=1, intercept=0))+geom_abline(aes(slope = 10, intercept=0), color = "green")+geom_abline(aes(slope = 0.1, intercept=0), color = "red")+geom_abline(aes(slope = 0.2, intercept=0), color = "blue")+geom_abline(aes(slope = 5, intercept=0), color = "blue")+coord_equal(ratio=1)+geom_abline(slope = -1, intercept = 10)+ggtitle(unique(.$marker)))

#plot all "bad markers"

#first add the missningness markers to the list to examine
mod_bad_plot_index <- which(plots$marker %in% miss_mod)
paralog_possible <- c(mod_bad_plot_index, paralog_possible)

# then loop through the plots by changing the index (here 33) until you have looked at all your questionable markers
plots$plots[[paralog_possible[10]]] #manually looped through these plots by changing the index for all 33 moderately bad markers, could make an lapply loop in the future, bad markers reported below

Removed 6 bad markers

# Local R

to_filt <- c("Omy_RAD103359-45" , "Omy_128693-455", "Omy_CRBF1-1", "Omy_RAD103359-45", "Omy_RAD29352-6", "Omy_RAD74691-49") # here list your bad marker names, if you have so many that this is unwieldy check out code snippet at bottom of this chunk
genos_0.5 <- genos_0.4 %>%
  dplyr::select(-one_of(to_filt))

Monomorphic Markers and Duplicates

1.0 Monomorphic Markers

To generate the 1.0 dataset, we remove monomorphic markers

genos_1.0 <- genos_0.5 %>% 
  select_if(~ length(unique(.)) > 1)

Removed 30 monomorphic

File Conversion and Stats

Final step of genotyping is to collect some stats about the genotype dataset and reformat the genotype file into common formats for import into other programs.

Stats

Here are some summary stats and figures from your filtered dataset

# LOCAL R

ggplot(genos_2.0)+geom_density(aes(x=`On-Target Reads`))+geom_vline(aes(xintercept=median(`On-Target Reads`)), color = "red") +theme_classic()
On Target Read Distribution

On Target Read Distribution

#LOCAL R
ggplot(genos_2.0)+geom_density(aes(x=`%On-Target`))+geom_vline(aes(xintercept=median(`%On-Target`)), color = "red") +theme_classic()
Proportion on Target

Proportion on Target

Depths

#LOCAL R

#code to estimate depth at filtered loci
marker_info %>%
  filter(marker %in% colnames(genos_2.0)) %>%
  filter(ind %in% genos_2.0$Sample) %>%
  mutate(sumdepth=a1_count+a2_count) %>%
  summarise(mean=mean(sumdepth, na.rm = TRUE), median=median(sumdepth, na.rm = TRUE), sd=sd(sumdepth, na.rm = TRUE))
marker_info %>%
  filter(marker %in% colnames(genos_2.0)) %>%
  filter(ind %in% genos_2.0$Sample) %>%
  mutate(sumdepth=a1_count+a2_count) %>%
  ggplot + aes(x=sumdepth)+geom_histogram()+theme_classic()+xlab("Mean Depth Per Locus Per Individual")

Final Dataset

The final dataset is composed of 71 individuals genotyped at 347 markers.

Conversion

Let’s get some usable file formats

Here’s adegenet’s genind object

#LOCAL R

# Convert to genind for import into adegenet

#first get a matrix to work on

#first change column to not include a dot
genos_2.1 <- genos_2.0
colnames(genos_2.1) <- gsub("\\.", "_", colnames(genos_2.1))
#convert to matrix with inds as row names
genos_2.1 <- as.matrix(genos_2.1[,c(9:(ncol(genos_2.1)-1))]) #caution potential hardcoding to exclude sex marker, get rid of the "-1" if you don't have a sex marker
row.names(genos_2.1) <- genos_2.0$sample_simple
genind_1.0 <- df2genind(genos_2.1, sep ="", ploidy=2,NA.char = "0")

#add in the populations
genos_2.2 <- genos_2.0 %>%
  left_join(select(meta_data, `SFGL Id`, Date, `Hat/Wild`, Sex, `Est. Age`, pop), by=c("sample_simple" = "SFGL Id")) %>%
  select(-c(Sample)) %>% # dont need this anymore
  rename(sample = sample_simple) %>% #rename this column
  relocate(sample, Date, `Hat/Wild`, Sex, `Est. Age`, pop) # reorder the columns, this may be different depending on the metadata columns you added


genind_1.0@pop <- as.factor(genos_2.2$pop) # this might change depending on metadata, for example if yo wanted to name by the column "pop" in your metadata table, change to as.factor(genos_2.2$pop) 

Finally, save your files as R objects for further analysis.

# LOCAL R

# here we save a few objects with useful info
genind_2.0 <- genind_1.0
save(genos_2.2, file ="genotype_data/genotypes_2.2.R")
save(genind_2.0, file= "genotype_data/genind_2.0.R")