require(poppr)
require(genepop)
require(graph4lg)
require(related)
require(adegenet)
require(knitr)
require(tidyverse)
require(magrittr)
This is document is an R notebook. If you’d like view to pre-rendered figures, read a summary of analysis and interact with code, please open the relevant html file in a browser.
To conduct a similar analyses on your computer, edit or run code: clone this repository into a directory on your local machine and open the .Rproj file in Rstudio.
This is the genotyping log for the 2021 Coquille River O. mykiss project
Summary
- SFGL Run Number 022 - Gtseq Data using the SFGL Omy392/Omy391 panel:
- Raw data from UofO sequencing center is located at “/dfs/Omalley_Lab/fitz/Runs/4819”
- Arrived demultiplexed, lane also contains Ots data - 91 samples fastqs + 5 controls
fastqc
Fastqc report for merged demulitplexed from this project available in repo under genotype_data directory
Everything looked pretty typical for a GTseq run, perhaps a little high on adapter Raw reads: 37178964 Q: >34 until base 60 Adapter Sequence: Zero until base 20, then gradual increase up to 40% by base 40.
sheet1 <- readxl::read_xlsx("metadata/GT-seq_GC3F-CKF-005_metadata.xlsx", sheet = 1)
sheet2 <- readxl::read_xlsx("metadata/GT-seq_GC3F-CKF-005_metadata.xlsx", sheet = 2)
meta_data <- sheet1 %>%
bind_rows(sheet2) %>%
select(-1) %>%
mutate(pop = str_sub(`SFGL Id`, 8, 11))
kable(meta_data %>%
group_by(pop, `Hat/Wild`) %>%
summarise(n = n()) )
| pop | Hat/Wild | n |
|---|---|---|
| NFCQ | Hatchery | 15 |
| NFCQ | Wild | 26 |
| SFCQ | Hatchery | 22 |
| SFCQ | Wild | 28 |
kable(meta_data %>%
group_by(pop, Date) %>%
summarise(n = n()) )
| pop | Date | n |
|---|---|---|
| NFCQ | 2016-03-30 | 15 |
| NFCQ | 2016-07-11 | 10 |
| NFCQ | 2016-07-13 | 16 |
| SFCQ | 2016-02-17 | 7 |
| SFCQ | 2016-03-01 | 10 |
| SFCQ | 2016-03-30 | 12 |
| SFCQ | 2016-07-12 | 21 |
rm(sheet1)
rm(sheet2)
41 North Fork Coquille River Winter Steelhead (including 15 hatchery and 26 wild) and 50 South Fork Coquille River Winter Steelhead (including 22 hatchery and 28 wild)
Main Genotyper
# SERVER
# Decompression script
# note 1: this is a script to save and submit as a job, save everything below the long ########### below
#note 2: the number of threads to use (-t option) is hardcoded to match the number of input files, change this number to reflect how many fastq.gz files you have
################################# save everything below this as a file
#!/bin/bash
#$ -S /bin/bash
#$ -t 1-96
#$ -tc 60
#$ -N decompress
#$ -cwd
#$ -o $JOB_NAME_$TASK_ID.out
#$ -e $JOB_NAME_$TASK_ID.err
FASTQS=(`ls *fastq.gz`)
INFILE=${FASTQS[$SGE_TASK_ID -1]}
gunzip -c $INFILE > ${INFILE%.gz}
#save as script and submit this with qsub -q harold scriptname
####################################
# SERVER
# Genotyper Script
# note 1: this is a script to save and submit as a job, save everything below the long ########### below
#note 2: the number of threads to use (-t option) is hardcoded to match the number of input files, change this number to reflect how many fastq files you have in the directory
################################# save everything below this as a file
#!/bin/bash
#$ -S /bin/bash
#$ -t 1-96
#$ -tc 96
#$ -N GTseq-genotyperv3
#$ -cwd
#$ -o $JOB_NAME_$TASK_ID.out
#$ -e $JOB_NAME_$TASK_ID.err
export PERL5LIB='/home/fw/dayand/perl5/lib/perl5/x86_64-linux-thread-multi/' #you may want to change this to your own perl lib destination
FASTQS=(`ls ./*fastq`) #reminder to change the directory to your copy of the fastqs
INFILE=${FASTQS[$SGE_TASK_ID -1]}
OUTFILE=$(basename ${INFILE%.fastq}.genos)
GTSEQ_GENO="/dfs/Omalley_Lab/dayan/software/GTseq-Pipeline/GTseq_Genotyper_v3.1.pl" #again, change this path to your own copy of this script
PROBE_SEQS="/dfs/Omalley_Lab/dayan/software/GTseq-Pipeline/Omy_390_probe_seqs.csv" #change the probe seq file to match whatever panel you are using
perl $GTSEQ_GENO $PROBE_SEQS $INFILE > $OUTFILE
#save this code chunk as a file on the server and submit this with the following command from the directory you want the output .genos files:
# qsub -q harold scriptname
# note that you might submit to a different -q than harold
Sex Genotyper
After the genotypes are written for the panel, we add the sex genotyper.
SGE_Batch -q harold -r omysex -c 'perl /dfs/Omalley_Lab/dayan/software/GTseq-Pipeline/OmySEX_test_v3.pl'
Compile Genotypes
After all the .genos are written, we compile them into a single output using the GenoCompile script
SGE_Batch -q harold -r compile -c 'perl /dfs/Omalley_Lab/dayan/software/GTseq-Pipeline/GTseq_GenoCompile_v3.pl > coquille_GTs_0.1.csv'
The first step in the QA-QC process is to collect some information about genotying success from the .genos files. We’ll do this with an awk one liner.
The script below will pull the allele count ratios and read counts for all individuals in the pipeline
# SERVER
#run from directory with your .genos (use a SGE_Batch job or interactive shell)
#collect marker info from all the genos files
touch marker_info.csv
echo 'ind,marker,a1_count,a2_count,called_geno,a1_corr,a2_corr' >> marker_info.csv
for file in ./*genos
do
awk -F"," ' BEGIN { OFS="," } {print FILENAME,$1,$2,$3,$6,$7,$8}' $file >> marker_info.csv
done
# now we'll cleanup this file so that it is easier to work with
sed -i '/Raw-Reads/d' ./marker_info.csv #first get rid of genos headers
sed -i '/negative/d' ./marker_info.csv # then get rid of controls
sed -i '/positive/d' ./marker_info.csv # then get rid of controls
sed -i '/Summer/d' ./marker_info.csv
sed -i '/Het/d' ./marker_info.csv
sed -i '/Winter/d' ./marker_info.csv
Read in the marker info file and clean it up a little more. Note you’ll have to transfer the file off the server for this.
#LOCAL R
marker_info <- read_csv("genotype_data/marker_info.csv")
#this part changes the values of A=2, G=898, -=52, etc for the allele count columns to the actual values, and gets rid of some mess in the sample names (ind)
marker_info %<>%
mutate(a1_count = as.numeric(substr(a1_count, 3, nchar(a1_count)))) %>%
mutate(a2_count = as.numeric(substr(a2_count, 3, nchar(a2_count)))) %>%
mutate(ind = str_remove(ind, "^\\./")) %>%
mutate(ind = str_remove(ind, "\\.genos")) %>%
filter(!(is.na(marker)))
fivefour <- filter(marker_info, marker =="Omy_RAD47080-54")
five3 <- filter(marker_info, marker =="Omy_RAD15709-53")
fivefour %<>%
mutate(a1_count = case_when(a1_count == 0 ~ 1,
TRUE ~ a1_count)) %>%
mutate(a2_count = case_when(a2_count == 0 ~ 1,
TRUE ~ a2_count)) %>%
mutate(allele_ratio_54 = a1_count/a2_count) %>%
rename(called_geno_54 = called_geno)
five3 %<>%
mutate(a1_count = case_when(a1_count == 0 ~ 1,
TRUE ~ a1_count)) %>%
mutate(a2_count = case_when(a2_count == 0 ~ 1,
TRUE ~ a2_count)) %>%
mutate(allele_ratio_53 = a2_count/a1_count) %>%#different strands so flip the ratios around
rename(called_geno_53 = called_geno)
comp <- select(fivefour, ind, allele_ratio_54, called_geno_54) %>%
left_join(select(five3, ind, allele_ratio_53, called_geno_53)) %>% #switch the called geno around for one of these because the probe seq is different strand
mutate(called_geno_54 = case_when(called_geno_54 == "A1HOM" ~ "A2HOM",
called_geno_54 == "A2HOM" ~ "A1HOM",
TRUE ~ called_geno_54)) %>%
mutate(agree = called_geno_54 == called_geno_53)
ggplot(data = comp)+geom_point(aes(log(allele_ratio_54), log(allele_ratio_53), color = agree))+theme_classic()+geom_abline(aes(slope =1, intercept = 0))
First let’s check that the controls worked well. We will check that negative controls have much fewer reads than average (there may be some on-target reads from othr samples due to index sequence error)
# LOCAL R
# First we are going to prep the raw genotype data for filtering.
# read the raw genotypes file in to R
genos_0.1 <- read_csv("genotype_data/coquille_GTs_0.1.csv")
# add a field to mark controls
# here controls contained "positive," "negative" in their sample names so used simple pattern matching to create a new column, you can add you own here for known controls (e.g. known winter run steelhead)
genos_0.1 %<>%
mutate(control = case_when(str_detect(Sample, "positive") ~ "positive",
str_detect(Sample, "negative") ~ "negative",
str_detect(Sample, "Het") ~ "het",
TRUE ~ "sample"))
# clean up sample name field
# split the sample name and the adapter sequence (note that replicates will have the same sample name, but we'll keep track with the adapter sequences)
genos_0.1 %<>%
mutate(adapter = str_extract(Sample, "[ATCG]{6}_[ATCG]{6}")) %>%
mutate(sample_simple = str_extract(Sample, "[:upper:][:lower:]{2}[AJCU][RC]\\d{2}\\w{4}_\\d{4}")) %>%
relocate(Sample, sample_simple, adapter)
# great, prep is done, now lets make our first plot: distribution of reads between controls and samples
ggplot()+geom_histogram(data = genos_0.1, aes(x = `On-Target Reads`, fill= control)) + theme_classic()+scale_fill_viridis_d()
A lot of failed samples in this run (~15), but controls look good.
#LOCAL R
ggplot()+geom_histogram(data = genos_0.1, aes(x = `%On-Target`, fill= control)) + theme_classic()+scale_fill_viridis_d()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
No replicates in this data.
#LOCAL R
# here we filter out our known controls and create our next dataset genos_0.11
genos_0.11 <- genos_0.1 %>%
filter(control == "sample") %>%
select(-control) #get rid of this column
# sometimes we'll use more than a single replicate, this broke a previous version of this code chunk, for now we'll find the triplicate (or more) samples, and keep the two with greatest on target read depth (ignoring triplicates quadruplicates etc)
genos_0.11 %<>%
group_by(sample_simple) %>%
slice_max(order_by = `On-Target Reads`, n = 2)
#now let's get duplicated samples
#dups <- genos_0.11[genos_0.11$sample_simple %in% genos_0.11$sample_simple[duplicated(genos_0.11$sample_simple)],]
#dups <- dups[order(dups$sample_simple),]
# next we'll calculate the percent concordance among replicates
# woof I don't see a good way around using a nested for loop here, maybe fix this in the future
#dups_genos <- dups[,9:ncol(dups)]
#rep_info <- matrix(ncol=ncol(dups_genos), nrow=nrow(dups_genos)/2)
#colnames(rep_info) <- colnames(dups_genos)
#for (j in 1:(nrow(dups_genos)/2)) {
#for (i in 1:ncol(dups_genos)) {
# rep_info[j,i] <- sum(dups_genos[(j*2)-1,i]==dups_genos[(j*2),i])
#}
# }
#geno_concordance <- as.data.frame(as.matrix(rep_info)) %>%
# rowMeans()
#rep_data <- as.data.frame(cbind(dups[c(1:length(geno_concordance))*2,1], geno_concordance))
#ggplot(data=rep_data)+geom_histogram(aes(x=geno_concordance))+theme_classic()
Replication Summary
Again this part of the SOP isnt doing anything because there are no replicates … Next let’s make the 0.2 dataset (i.e. remove the replicates with lower GT success).
# LOCAL R
#this writes a new dataset (0.2) by choosing the samples within duplicates and keeping the one with the highest genotyping success
genos_0.2 <- genos_0.11 %>%
group_by(sample_simple) %>%
filter(`On-Target Reads` == max(`On-Target Reads`))
If you are using the OmySEX and OtsSEX scripts to call sex genotypes, read this section. If else, skip it, and move on to filtering.
The OmySEX and OtsSEX scripts rely on hardcoded estimates of the proportion of reads dedicated to the sex marker to call sex genotypes. This can sometimes go awry if the sex marker does not amplify as expected during the library prep (e.g. primers are aging, proportion of primers in the primer pool is off, etc).
In this portion of the SOP we will check if the sex genotyping script worked well, and if not, apply a correction.
First, let’s check if the script worked correctly.
#LOCAL R
# Plot sex marker counts
sex_marker_info <- marker_info %>%
filter(str_detect(marker, "SEXY")) %>%
mutate(called_geno = replace_na(called_geno, "00")) %>%
mutate(called_geno = case_when(called_geno == "A1HOM" ~ "XX",
called_geno == "HET" ~ "XY",
called_geno == "00" ~ "00"))
ggplot(data = sex_marker_info)+geom_point(aes(a2_count, a1_count, color = called_geno))+scale_colour_viridis_d(name = "Called Sex Genotype")+theme_classic()+xlab("Y-specific probe count")+ylab("Theoretical X chromosome count")+geom_abline(aes(intercept = 0, slope = 0.1))+geom_abline(aes(intercept = 0, slope = 5))+geom_abline(aes(intercept = 0, slope = 10))+geom_abline(aes(intercept = 0, slope = 0.2))+xlim(0,max(c(sex_marker_info$a1_count, sex_marker_info$a2_count)))+ylim(0,max(c(sex_marker_info$a1_count, sex_marker_info$a2_count)))+geom_abline(aes(intercept = 0, slope = 1), color = "darkred")
kable(sex_marker_info %>% count(called_geno), caption = "Called Sex Ratio")
| called_geno | n |
|---|---|
| 00 | 15 |
| XX | 38 |
| XY | 38 |
Yes needs correction.
Here, instead of relying on a harcoded value for the proportion of reads dedicated to the sex marker, we will attempt to estimate this value from the empirical data, and use this estimate to re-call sex genotypes.
#LOCAL R
# First let's collect the number of on-target reads for each sample in the sex_marker_info dataframe
sex_marker_info$sample <- str_extract(sex_marker_info$ind, "[:upper:][:lower:]{2}[AJCU][RC]\\d{2}\\w{4}_\\d{4}")
sex_marker_info$adapter <- str_extract(sex_marker_info$ind, "[ATCG]{6}_[ATCG]{6}")
sex_marker_info %<>%
left_join(dplyr::select(genos_0.1, sample_simple, adapter, `On-Target Reads`), by = c("sample" = "sample_simple" , "adapter" = "adapter"))
# now here's the tricky part. we're going to do a first pass filter to eliminate any samples that coldn't possibly be males, then estimate the theoretical number of X chromosome counts for possible males
#first get the estimate
Xmale_proportion_error <- sex_marker_info %>%
filter(a2_count > 1) %>% #this value is set to 1 if the actual count is zero in the omysexscript, so here we're are eliminating all individuals where it woulbe extremely unlikely that they are males
summarise(Xmale_proportion = mean(a2_count/`On-Target Reads`))
#then use the estimate to correct the data
sex_marker_info %<>%
mutate(XY_count_estimate = Xmale_proportion_error[[1]]*`On-Target Reads`) %>%
mutate(corrected_ratio = XY_count_estimate/a2_count) %>%
mutate(corrected_sex_genotype = case_when(corrected_ratio > 10 ~ "XX",
corrected_ratio <= 0.1 ~ "XY",
corrected_ratio <= 0.2 ~ "00",
corrected_ratio <= 5 ~ "XY",
TRUE ~ "00"
)) #note these ratios are taken directly from the omysex script but this seems like another bug, shouldn't XY < 0.2 be NA not XY???
Now we’ll plot again and see if this correction worked as expected.
#LOCAL R
ggplot(data = sex_marker_info)+geom_point(aes(a2_count, XY_count_estimate, color = corrected_sex_genotype), alpha = 0.7)+scale_colour_viridis_d(name = "Corrected Called\nSex Genotype")+theme_classic()+xlab("Y-specific probe count")+ylab("Corrected Theoretical X chromosome count")+geom_abline(aes(intercept = 0, slope = 0.1))+geom_abline(aes(intercept = 0, slope = 5))+geom_abline(aes(intercept = 0, slope = 10))+geom_abline(aes(intercept = 0, slope = 0.2))+xlim(0,max(c(sex_marker_info$XY_count_estimate, sex_marker_info$a2_count)))+ylim(0,max(c(sex_marker_info$XY_count_estimate, sex_marker_info$a2_count)))+geom_abline(aes(intercept = 0, slope = 1), color = "darkred")
kable(sex_marker_info %>%
group_by(corrected_sex_genotype) %>%
summarise(count = n(), mean_depth = mean(`On-Target Reads`)), caption = "Corrected Called Sex Ratio")
| corrected_sex_genotype | count | mean_depth |
|---|---|---|
| 00 | 5 | 11194.40 |
| XX | 36 | 137968.31 |
| XY | 50 | 77497.84 |
This correction looks like it worked: Called males follow a 1:1 line and there are very few uncalled sex genotypes, except for samples with very low total on-target reads. Also the overall sex ratio of this sample is closer to our a priori expectations.
Finally let’s correct the sex genotypes in the data.
#LOCAL R
# run this command for Omy
genos_0.2 %<>%
left_join(select(sex_marker_info, sample, adapter, corrected_sex_genotype), by = c("sample_simple"="sample", "adapter"="adapter")) %>%
mutate(OmyY1_2SEXY = corrected_sex_genotype) %>%
select(-corrected_sex_genotype)
Control and replicates have been removed, now it’s time for filtering.
Filtering Summary
We take an iterative approach to filtering:
First remove worst individuals and genotypes: - GTperc_cutoff=30 (indivudals greater than 30% missing data excluded) - Missingness (loci) > 50% (loci with total missing data > 50% removed) - IFI_cutoff = 10 (i.e. >10% background reads)
Then recalculate missingness and IFI - IFI_cutoff=2.5
- GTperc_cutoff=90 (inds greater than 10% missing data excluded)
- Missingness (loci) > 20%
Then examine for paralogues among markers with
- Missingness (loci) > 10% - examine for allele correction issues
- Markers where heterozygotes and “in-betweeners” do not follow 1:1 ratio of allele counts - Markers with high variance in ratio of allele counts at heteroyzgotes and “in-betweeners” - Remove monomorphic SNPs
First we filter individuals and loci on IFI, and missingness.
Let’s take a look at the distribution of these values before any filtering
#LOCAL R
ggplot(genos_0.2)+geom_histogram(aes(x=IFI))+geom_vline(aes(xintercept= 2.5), color="red")+theme_classic()
ggplot(genos_0.2)+geom_histogram(aes(x=`%GT`))+geom_vline(aes(xintercept= 90), color="red")+theme_classic()
missingness <- (colSums(genos_0.2[,c(9:ncol(genos_0.2))] == "00" | genos_0.2[,c(8:(ncol(genos_0.2)-1))] == "0"))/nrow(genos_0.2) #warning hardcoding: "[,8:398]" is hardcoded to work on the example script using the Omy panel with 390 markers, these values will need to be changed to reflect the genotype columns of the genos r object that YOU are running. This excludes columns with metadata and genotyping results such as "sample name" "ifi" "on-target reads" etc
missing <- as.data.frame(missingness)
missing$marker <- row.names(missing)
ggplot(missing) + geom_histogram(aes(x=missingness))+geom_vline(aes(xintercept= 0.2), color="red")+geom_vline(aes(xintercept= 0.1), color="blue")+theme_classic()+xlab("missingness (loci)")
0.3: Extremely Bad Loci and Individuals Excluded
First remove the individuals and markers that clearly failed to genotype correctly (one step at a time)
#print table of bad missingness individual
kable(genos_0.2 %>%
filter(`%GT` < 70) %>%
select(2,4,5,6,7), caption = paste(nrow(genos_0.2 %>% filter(`%GT` < 70)), "Individuals with high missingess (>30% missing data)"))
| sample_simple | Raw Reads | On-Target Reads | %On-Target | %GT |
|---|---|---|---|---|
| OmyAC16SFCQ_0009 | 303860 | 265 | 0.09 | 0.00 |
| OmyAC16SFCQ_0016 | 388263 | 2202 | 0.57 | 13.59 |
| OmyAC16SFCQ_0017 | 219946 | 209 | 0.10 | 0.00 |
| OmyJC16NFCQ_0008 | 359953 | 518 | 0.14 | 0.00 |
| OmyJC16NFCQ_0011 | 316462 | 279 | 0.09 | 0.00 |
| OmyJC16NFCQ_0018 | 364362 | 2079 | 0.57 | 18.46 |
| OmyJC16NFCQ_0033 | 255828 | 1567 | 0.61 | 9.49 |
| OmyJC16NFCQ_0037 | 252004 | 241 | 0.10 | 0.00 |
| OmyJC16SFCQ_0026 | 199637 | 254 | 0.13 | 0.00 |
| OmyJC16SFCQ_0027 | 259771 | 424 | 0.16 | 0.51 |
| OmyJC16SFCQ_0034 | 202708 | 483 | 0.24 | 0.51 |
| OmyJC16SFCQ_0038 | 176997 | 4206 | 2.38 | 39.23 |
| OmyJC16SFCQ_0041 | 318705 | 9208 | 2.89 | 68.97 |
| OmyJC16SFCQ_0043 | 240443 | 2732 | 1.14 | 22.31 |
| OmyJC16SFCQ_0045 | 200057 | 1044 | 0.52 | 3.85 |
# now remove them
genos_0.3 <- genos_0.2 %>%
filter(`%GT` > 70)
#now recalculate locus level missingness after removing the worst individuals
missingness2 <- (colSums(genos_0.3[,c(9:(ncol(genos_0.3)))] == "00" | genos_0.3[,c(9:(ncol(genos_0.3)))] == "0"))/nrow(genos_0.3)
missing2 <- as.data.frame(missingness2)
missing2$marker <- row.names(missing2)
#then remove these markers
# collect bad markers
very_bad_markers <- missing2[missing2$missingness2>0.5, 2]
print(paste(length(very_bad_markers), "markers with > 50% missing data"))
## [1] "2 markers with > 50% missing data"
#write the new dataset
genos_0.3 <- genos_0.3 %>%
dplyr::select(-one_of(very_bad_markers))
#then recalculate IFI
# IFI is equal to the percentage of "background" reads to homozygote reads. Two types of reads contribute to background count: (1) Reads from the alternative allele when an individual has been called as homozygote at a locus, and (2) reads from the less frequent allele when the individual has been called as "in-betweener". We update the IFI score by including only markers in the filtered dataset
IFI <- marker_info %>%
filter(marker %in% colnames(genos_0.3)) %>%
group_by(ind) %>%
summarize(back_count = sum(a1_count[called_geno == "A2HOM"], na.rm = TRUE)
+ sum(a2_count[called_geno == "A1HOM"], na.rm = TRUE)
+ sum(a1_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a2_count > a1_count)], na.rm = TRUE )
+ sum(a2_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a1_count > a2_count)], na.rm = TRUE ),
hom_ct = sum(a1_count[called_geno == "A1HOM"], na.rm = TRUE)
+ sum(a2_count[called_geno == "A2HOM"], na.rm = TRUE)
+ sum(a2_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a2_count > a1_count)], na.rm = TRUE )
+ sum(a1_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a1_count > a2_count)], na.rm = TRUE ),
ifi2 = (back_count/hom_ct)*100)
# the "marker_info" file we produced earlier used the filename of the genos file as the sample name (column name "ind"), but the sample names in our local R dataframes are very cleaned up (see line 504). Here I attempt to do the same using some regex in R using the standardized codes for sample naming at SFGL, but note that depending on how your fastq files are named, these exact matches may not work for you
# until we find a better solution I suggest two alternatives if this regex below breaks
# 1: if the number of high IFI samples is very low, just write the sample names out manually to a vector and use this to filter
# 2:
IFI$sample <- str_extract(IFI$ind, "[:upper:][:lower:]{2}[AJCU][RC]\\d{2}\\w{4}_\\d{4}")
IFI$adapter <- str_extract(IFI$ind, "[ATCG]{6}_[ATCG]{6}")
genos_0.3 <- genos_0.3 %>%
left_join(select(IFI, sample, adapter, ifi2), by = c("sample_simple" = "sample", "adapter" = "adapter")) %>%
mutate(IFI = ifi2) %>%
select(-one_of("ifi2"))
# now filter on IFI
#print table of bad IFI samples
kable(genos_0.3 %>%
filter(IFI >10) %>%
select(2:7), caption = "Extreme High IFI (>10) samples (low confidence barcodes)")
| sample_simple | adapter | Raw Reads | On-Target Reads | %On-Target | %GT |
|---|
#update the dataset
genos_0.3 <- genos_0.3 %>%
filter(IFI < 10)
Filtering log 0.2 -> 0.3:
15 high missingness inds 2 markers high missingness 0 IFI
0.4 Second Iteration Filter
Next we do the same process, but at the final filtering levels:
#print table of bad missingness individual
kable(genos_0.3 %>%
filter(`%GT` < 90) %>%
select(2,4,5,6,7,8), caption = paste(nrow(genos_0.3 %>% filter(`%GT` < 90)), "Individuals with high missingess (>10% missing data)"))
| sample_simple | Raw Reads | On-Target Reads | %On-Target | %GT | IFI |
|---|---|---|---|---|---|
| OmyAC16SFCQ_0005 | 228966 | 24288 | 10.61 | 87.18 | 0.3561465 |
| OmyAC16SFCQ_0006 | 227920 | 13695 | 6.01 | 74.36 | 2.8393758 |
| OmyJC16SFCQ_0024 | 384926 | 45415 | 11.80 | 79.49 | 0.6486706 |
# now remove them
genos_0.4 <- genos_0.3 %>%
filter(`%GT` > 90)
#now recalculate locus level missingness after removing the worst individuals
missingness3 <- (colSums(genos_0.4[,c(9:(ncol(genos_0.4)))] == "00" | genos_0.4[,c(9:(ncol(genos_0.4)))] == "0"))/nrow(genos_0.4)
missing3 <- as.data.frame(missingness3)
missing3$marker <- row.names(missing3)
#then remove these markers
# collect bad markers
bad_markers <- missing3[missing3$missingness3>0.2, 2]
print(paste(length(bad_markers), "markers with > 20% missing data"))
## [1] "6 markers with > 20% missing data"
#write the new dataset
genos_0.4 <- genos_0.4 %>%
dplyr::select(-one_of(bad_markers))
#then recalculate IFI
# IFI is equal to the percentage of "background" reads to homozygote reads. Two types of reads contribute to background count: (1) Reads from the alternative allele when an individual has been called as homozygote at a locus, and (2) reads from the less frequent allele when the individual has been called as "in-betweener"
IFI <- marker_info %>%
filter(marker %in% colnames(genos_0.4)) %>%
group_by(ind) %>%
summarize(back_count = sum(a1_count[called_geno == "A2HOM"], na.rm = TRUE)
+ sum(a2_count[called_geno == "A1HOM"], na.rm = TRUE)
+ sum(a1_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a2_count > a1_count)], na.rm = TRUE )
+ sum(a2_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a1_count > a2_count)], na.rm = TRUE ),
hom_ct = sum(a1_count[called_geno == "A1HOM"], na.rm = TRUE)
+ sum(a2_count[called_geno == "A2HOM"], na.rm = TRUE)
+ sum(a2_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a2_count > a1_count)], na.rm = TRUE )
+ sum(a1_count[is.na(called_geno) == TRUE & ((a1_count + a2_count)>=10) & (a1_count > a2_count)], na.rm = TRUE ),
ifi2 = (back_count/hom_ct)*100)
# the "marker_info" file we produced earlier used the filename of the genos file as the sample name (column name "ind"), but the sample names in our local R dataframes are very cleaned up (see line 504). Here I attempt to do the same using some regex in R using the standardized codes for sample naming at SFGL, but note that depending on how your fastq files are named, these exact matches may not work for you
# until we find a better solution I suggest two alternatives if this regex below breaks
# 1: if the number of high IFI samples is very low, just write the sample names out manually to a vector and use this to filter
# 2:
IFI$sample <- str_extract(IFI$ind, "[:upper:][:lower:]{2}[AJCU][RC]\\d{2}\\w{4}_\\d{4}")
IFI$adapter <- str_extract(IFI$ind, "[ATCG]{6}_[ATCG]{6}")
genos_0.4 <- genos_0.4 %>%
left_join(select(IFI, sample, adapter, ifi2), by = c("sample_simple" = "sample", "adapter" = "adapter")) %>%
mutate(IFI = ifi2) %>%
select(-one_of("ifi2"))
# now filter on IFI
#print table of bad IFI samples
kable(genos_0.4 %>%
filter(IFI >2.5) %>%
select(2:7), caption = "High IFI (>2.5) samples (low confidence barcodes)")
| sample_simple | adapter | Raw Reads | On-Target Reads | %On-Target | %GT |
|---|---|---|---|---|---|
| OmyJC16NFCQ_0042 | TGACCA_TAGTAT | 238493 | 49634 | 20.81 | 91.03 |
| OmyJC16NFCQ_0044 | TGACCA_TGCTTA | 595310 | 236707 | 39.76 | 91.03 |
#update the dataset
genos_0.4 <- genos_0.4 %>%
filter(IFI < 2.5)
0.3 -> 0.4 Filtering Log
Filtered out:
3 individuals with <90% genotying success (i.e. greater than 10% missing data)
6 markers with > 20% missingness
2 contaminated samples (note here that all the samples with high IFI are already removed by the individual level missingness step in the example data)
0.5: Removing Paralogs
Now we manually examine allele counts for markers that may tag paralogues regions. Because our panels can contain hundreds of loci, we flag three types of markers for close scrutiny (below), but this is informal and you can also look at any marker you want using some of the scripts below.
- Missingness (loci) > 10% - examine for allele correction issues
- Markers where heterozygotes and “in-betweeners” do not follow 1:1 ratio of allele counts - Markers with high variance in ratio of allele counts at heteroyzgotes and “in-betweeners”
Let’s collect these markers, first markers with high missingness (10-20% missingness)
# Local R
#get marker names of markers with 0.1 > missingness > 0.2
miss0.1 <- missing3[missing3$missingness3 > 0.1,]
miss_mod <- miss0.1[miss0.1$missingness3 < 0.2, 2]
Next, markers with skewed allele count ratios and allele ratios with high variance. We do this by fitting a linear model between allele 1 counts and allele 2 counts and then flagging markers with a ratio of > 1.5 (3/2) and less than 2/3. We also flag markers where the fit
library(lme4)
hets <- filter(marker_info, called_geno == "HET" | is.na(called_geno))
models <- hets %>%
filter(marker %in% colnames(genos_0.4)) %>%
filter(is.na(a1_count) == FALSE & is.na(a2_count) == FALSE) %>%
group_by(marker) %>%
group_map(~ lm(a1_count ~ a2_count, data= .))
# Apply coef to each model and return a list of allele count ratios
lms <- lapply(models, coef)
ggplot()+geom_histogram(aes(x = sapply(lms,`[`,2)))+theme_classic()+ggtitle("allele ratios for all NA and HET calls")+geom_vline(aes(xintercept = 1.5), color = "red", linetype = 2)+geom_vline(aes(xintercept = (2/3)), color = "red", linetype = 2)+xlab("allele ratio (a1/a2)")+geom_vline(aes(xintercept = 1), color = "black")
#list of p-values
lms_anova <- lapply(models, summary)
# collect info about each bad model
paralog_possible <- which(abs(sapply(lms,`[`,2)) > 1.5) #bad because a positively skewed allele ratio
paralog_possible2 <- which(abs(sapply(lms,`[`,2)) < (2/3)) # bad because a negative skewed allele ratio
paralog_possible3 <- which(sapply(lms_anova, function(x) x$coefficients[,4][2])> 0.01) # bad because too much variance in allele ratio, even if mean ratio is 1
paralog_possible <- c(paralog_possible, paralog_possible2, paralog_possible3)
# R Local
plots <- marker_info %>%
filter(marker %in% colnames(genos_0.4)) %>%
filter(is.na(a1_count) == FALSE & is.na(a2_count) == FALSE) %>%
group_by(marker) %>%
do(plots=ggplot(data=.)+geom_point(aes(a1_count, a2_count, color = called_geno))+theme_classic()+geom_abline(aes(slope=1, intercept=0))+geom_abline(aes(slope = 10, intercept=0), color = "green")+geom_abline(aes(slope = 0.1, intercept=0), color = "red")+geom_abline(aes(slope = 0.2, intercept=0), color = "blue")+geom_abline(aes(slope = 5, intercept=0), color = "blue")+coord_equal(ratio=1)+geom_abline(slope = -1, intercept = 10)+ggtitle(unique(.$marker)))
#plot all "bad markers"
#first add the missningness markers to the list to examine
mod_bad_plot_index <- which(plots$marker %in% miss_mod)
paralog_possible <- c(mod_bad_plot_index, paralog_possible)
# then loop through the plots by changing the index (here 33) until you have looked at all your questionable markers
plots$plots[[paralog_possible[10]]] #manually looped through these plots by changing the index for all 33 moderately bad markers, could make an lapply loop in the future, bad markers reported below
Removed 6 bad markers
# Local R
to_filt <- c("Omy_RAD103359-45" , "Omy_128693-455", "Omy_CRBF1-1", "Omy_RAD103359-45", "Omy_RAD29352-6", "Omy_RAD74691-49") # here list your bad marker names, if you have so many that this is unwieldy check out code snippet at bottom of this chunk
genos_0.5 <- genos_0.4 %>%
dplyr::select(-one_of(to_filt))
1.0 Monomorphic Markers
To generate the 1.0 dataset, we remove monomorphic markers
genos_1.0 <- genos_0.5 %>%
select_if(~ length(unique(.)) > 1)
Removed 30 monomorphic
Final step of genotyping is to collect some stats about the genotype dataset and reformat the genotype file into common formats for import into other programs.
Here are some summary stats and figures from your filtered dataset
# LOCAL R
ggplot(genos_2.0)+geom_density(aes(x=`On-Target Reads`))+geom_vline(aes(xintercept=median(`On-Target Reads`)), color = "red") +theme_classic()
On Target Read Distribution
#LOCAL R
ggplot(genos_2.0)+geom_density(aes(x=`%On-Target`))+geom_vline(aes(xintercept=median(`%On-Target`)), color = "red") +theme_classic()
Proportion on Target
Depths
#LOCAL R
#code to estimate depth at filtered loci
marker_info %>%
filter(marker %in% colnames(genos_2.0)) %>%
filter(ind %in% genos_2.0$Sample) %>%
mutate(sumdepth=a1_count+a2_count) %>%
summarise(mean=mean(sumdepth, na.rm = TRUE), median=median(sumdepth, na.rm = TRUE), sd=sd(sumdepth, na.rm = TRUE))
marker_info %>%
filter(marker %in% colnames(genos_2.0)) %>%
filter(ind %in% genos_2.0$Sample) %>%
mutate(sumdepth=a1_count+a2_count) %>%
ggplot + aes(x=sumdepth)+geom_histogram()+theme_classic()+xlab("Mean Depth Per Locus Per Individual")
The final dataset is composed of 71 individuals genotyped at 347 markers.
Let’s get some usable file formats
Here’s adegenet’s genind object
#LOCAL R
# Convert to genind for import into adegenet
#first get a matrix to work on
#first change column to not include a dot
genos_2.1 <- genos_2.0
colnames(genos_2.1) <- gsub("\\.", "_", colnames(genos_2.1))
#convert to matrix with inds as row names
genos_2.1 <- as.matrix(genos_2.1[,c(9:(ncol(genos_2.1)-1))]) #caution potential hardcoding to exclude sex marker, get rid of the "-1" if you don't have a sex marker
row.names(genos_2.1) <- genos_2.0$sample_simple
genind_1.0 <- df2genind(genos_2.1, sep ="", ploidy=2,NA.char = "0")
#add in the populations
genos_2.2 <- genos_2.0 %>%
left_join(select(meta_data, `SFGL Id`, Date, `Hat/Wild`, Sex, `Est. Age`, pop), by=c("sample_simple" = "SFGL Id")) %>%
select(-c(Sample)) %>% # dont need this anymore
rename(sample = sample_simple) %>% #rename this column
relocate(sample, Date, `Hat/Wild`, Sex, `Est. Age`, pop) # reorder the columns, this may be different depending on the metadata columns you added
genind_1.0@pop <- as.factor(genos_2.2$pop) # this might change depending on metadata, for example if yo wanted to name by the column "pop" in your metadata table, change to as.factor(genos_2.2$pop)
Finally, save your files as R objects for further analysis.
# LOCAL R
# here we save a few objects with useful info
genind_2.0 <- genind_1.0
save(genos_2.2, file ="genotype_data/genotypes_2.2.R")
save(genind_2.0, file= "genotype_data/genind_2.0.R")