1. Load R libraries and software

library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
library(here)
library(dplyr)
library(ggplot2)
library(colorout)
library(extrafont)
library(reticulate)
library(scales)
library(stringr)
library(grid)
library(flextable)
library(devtools)
library(readr)
library(purrr)
library(ggtext)
library(ggvenn)
#library(data.table)
library(RColorBrewer)
library(ggrepel)

1.1. Set directory for the project

cd /gpfs/gibbs/pi/caccone/mkc54/albo

1.2. Get software for quality control

Check your Plink version.

module spider plink
#Versions on the cluster:
        #LINK/1.9b_6.21-x86_64
        #LINK/2_avx2_20221024
module load PLINK/2_avx2_20221024

plink2 --version
#PLINK v2.00a3.7LM AVX2 Intel (24 Oct 2022)

Check your BCFtools version.

module spider BCFtools
#BCFtools/1.16-GCCcore-10.2.0

module load BCFtools/1.16-GCCcore-10.2.0

1.3. Check if the data is in the correct location

ls /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/* # we use * to truncate the name of the file showing all files

cd /gpfs/gibbs/pi/caccone/mkc54/albo
## /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/albo_euro_global_1Dec2023.log
## /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/albo_euro_global_1Dec2023.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/albo_europe_11Sep2023.log
## /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/albo_europe_11Sep2023.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/euro_asia_8america_4africa_28Nov2023.log
## /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/euro_asia_8america_4africa_28Nov2023.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/europe_native_11Sep2023.log
## /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/europe_native_11Sep2023.vcf

I prepared the data with the families and individual ids. I also set the reference alleles to match the AlbF3 genome assembly

We can check how many sample names we have in our vcf

# make sure you have all the .CEL samples in your family file
bcftools query -l /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/albo_europe_11Sep2023.vcf | wc -l
#423

2. The data

2.1. Use Plink2 to convert to bed format

plink2 \
--allow-extra-chr \
--vcf data/europe/albo_europe_11Sep2023.vcf \
--const-fid \
--make-bed \
--exclude europe/output/files/albopictus_SNPs_fail_segregation.txt \
--fa /gpfs/gibbs/project/caccone/mkc54/albo/genome/albo.fasta.gz \
--ref-from-fa 'force' `# sets REF alleles when it can be done unambiguously, we use force to change the alleles` \
--out europe/output/file1 \
--silent;
# --keep-allele-order \ if you use Plink 1.9
grep "variants" europe/output/file1.log; # to get the number of variants from the log file.

–vcf: 117981 variants scanned. 117981 variants loaded from europe/output/file1-temporary.pvar.zst. –exclude: 116475 variants remaining. 116475 variants remaining after main filters. –ref-from-fa force: 0 variants changed, 116475 validated.

Check the fam file

cd /gpfs/gibbs/pi/caccone/mkc54/albo

head europe/output/file1.fam
## 0    1065_SOC.CEL    0   0   0   -9
## 0    1066_SOC.CEL    0   0   0   -9
## 0    1067_SOC.CEL    0   0   0   -9
## 0    1068_SOC.CEL    0   0   0   -9
## 0    1069_SOC.CEL    0   0   0   -9
## 0    1070_SOC.CEL    0   0   0   -9
## 0    1071_SOC.CEL    0   0   0   -9
## 0    1072_SOC.CEL    0   0   0   -9
## 0    1073_SOC.CEL    0   0   0   -9
## 0    1074_SOC.CEL    0   0   0   -9
cd /gpfs/gibbs/pi/caccone/mkc54/albo
head -n 5 scripts/RMarkdowns/sample_ped_info_ALLPOPS.txt
## Sample Filename  Family_ID   Individual_ID   Father_ID   Mother_ID   Sex Affection Status
## 8_MAN_Brazil.CEL MAU 8   0   0   0   -9
## 9_MAN_Brazil.CEL MAU 9   0   0   0   -9
## 16_MAN_Brazil.CEL    MAU 16  0   0   0   -9
## 17_MAN_Brazil.CEL    MAU 17  0   0   0   -9

2.2. Use R to update the .fam file

Import the fam file we use with Axiom Suite

# the order of the rows in this file does not matter
samples <-
  read.delim(
    file   = here(
      "sample_ped_info_ALLPOPS.txt"
    ),
    header = TRUE
  )
head(samples)
##     Sample.Filename Family_ID Individual_ID Father_ID Mother_ID Sex
## 1  8_MAN_Brazil.CEL       MAU             8         0         0   0
## 2  9_MAN_Brazil.CEL       MAU             9         0         0   0
## 3 16_MAN_Brazil.CEL       MAU            16         0         0   0
## 4 17_MAN_Brazil.CEL       MAU            17         0         0   0
## 5 18_MAN_Brazil.CEL       MAU            18         0         0   0
## 6 60_MAN_Brazil.CEL       MAU            60         0         0   0
##   Affection.Status
## 1               -9
## 2               -9
## 3               -9
## 4               -9
## 5               -9
## 6               -9

1298 obs. of 7 variables

Import .fam file we created once we created the bed file using Plink2

#The fam file is the same for both data sets with the default or new priors
fam1 <-
  read.delim(
    file   = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file1.fam"),
    header = FALSE,
    
  )
head(fam1)
##   V1           V2 V3 V4 V5 V6
## 1  0 1065_SOC.CEL  0  0  0 -9
## 2  0 1066_SOC.CEL  0  0  0 -9
## 3  0 1067_SOC.CEL  0  0  0 -9
## 4  0 1068_SOC.CEL  0  0  0 -9
## 5  0 1069_SOC.CEL  0  0  0 -9
## 6  0 1070_SOC.CEL  0  0  0 -9

We can merge the tibbles.

# Extract the number part from the columns
fam1_temp <- fam1 |>
  mutate(num_id = as.numeric(str_extract(V2, "^\\d+")))

samples_temp <- samples |>
  mutate(num_id = as.numeric(str_extract(Sample.Filename, "^\\d+")))

# Perform the left join using the num_id columns and keep the order of fam1
df <- fam1_temp |>
  dplyr::left_join(samples_temp, by = "num_id") |>
  dplyr::select(-num_id) |>
  dplyr::select(8:13)

# check the data frame
head(df)
##   Family_ID Individual_ID Father_ID Mother_ID Sex Affection.Status
## 1       SOC          1065         0         0   0               -9
## 2       SOC          1066         0         0   0               -9
## 3       SOC          1067         0         0   0               -9
## 4       SOC          1068         0         0   0               -9
## 5       SOC          1069         0         0   0               -9
## 6       SOC          1070         0         0   0               -9

We can check how many samples we have in our file

nrow(df)
## [1] 423

Before you save the new fam file, you can change the original file to a different name, to compare the order later. If you want to repeat the steps above after you saving the new file1.fam, you will need to import the vcf again.

# Save and override the .fam file for dp
write.table(
  df,
  file      = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file1b.fam"),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Check the new .fam file to see if has the order and the sample attributes we want.

# you can open the file on a text editor and double check the sample order and information.
head -n 5 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file1b.fam
## SOC  1065    0   0   0   -9
## SOC  1066    0   0   0   -9
## SOC  1067    0   0   0   -9
## SOC  1068    0   0   0   -9
## SOC  1069    0   0   0   -9

2.3. Remove duplicates

We have 4 samples genotyped 3 times each. We can keep only one of them

We can check for the triplicates in our dataset

# Check triplicates
cd /gpfs/gibbs/pi/caccone/mkc54/albo
grep "a\|b\|c" /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file1b.fam

We don’t have any duplicates in this dataset, we can skip this step

To subset the data we need to create a list of samples with family id and individual ids

cd /gpfs/gibbs/pi/caccone/mkc54/albo
grep "b\|c" europe/output/file1b.fam | awk '{print $1, $2}' > europe/output/files/duplicates_to_remove.txt;
head europe/output/files/duplicates_to_remove.txt

Create a new bed removing the duplicated samples. We can also make sure the reference alleles match the reference genome

# I created a fam file with the information about each sample, but first we import the data and create a bed file setting the family id constant
plink2 \
--allow-extra-chr \
--bfile europe/output/file1 \
--make-bed \
--fa /gpfs/gibbs/project/caccone/mkc54/albo/genome/albo.fasta.gz \
--ref-from-fa 'force' `# sets REF alleles when it can be done unambiguously, we use force to change the alleles` \
--exclude europe/output/files/albopictus_SNPs_fail_segregation.txt \
--remove europe/output/files/duplicates_to_remove.txt \
--out europe/output/file1b \
--silent;
# --keep-allele-order \ if you use Plink 1.9
grep "samples\|variants" europe/output/file1b.log # to get the number of variants from the log file.

423 samples (44 females, 1 male, 378 ambiguous; 423 founders) loaded from 116475 variants loaded from europe/output/file1.bim. –exclude: 116475 variants remaining. –remove: 423 samples remaining. 423 samples (44 females, 1 male, 378 ambiguous; 423 founders) remaining after 116475 variants remaining after main filters. –ref-from-fa force: 0 variants changed, 116475 validated.

Using the default prior we obtained 107,294 SNPs. All the reference alleles matched the reference genome (AalbF3). We had no duplicates to remove

Check the headings of the the files we will work on.

head -n 5 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file1b.fam
## SOC  1065    0   0   0   -9
## SOC  1066    0   0   0   -9
## SOC  1067    0   0   0   -9
## SOC  1068    0   0   0   -9
## SOC  1069    0   0   0   -9

Check how many samples are in the .fam file

wc -l /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file1b.fam
## 423 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file1b.fam

2.4. Checking the number of samples and localities

This part is very important. Make sure there are no NAs or something that is not what you would expect. For example, the number of mosquitoes per population.

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file1b.fam | sort | uniq -c | awk '{print $2, $1}' 
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BRE 13
## BUL 10
## CES 14
## CRO 12
## DES 17
## FRS 12
## GES 12
## GRA 12
## GRC 10
## IMP 4
## ITB 6
## ITP 10
## ITR 12
## KER 12
## KRA 12
## MAL 12
## POL 2
## POP 12
## RAR 12
## ROM 4
## ROS 12
## SCH 5
## SER 4
## SEV 12
## SIC 12
## SLO 12
## SOC 12
## SPB 8
## SPC 6
## SPM 6
## SPS 9
## STS 7
## TIK 12
## TIR 4
## TRE 12
## TUA 12
## TUH 12

3. Quality control steps

3.1. Missingness

plink2 \
--allow-extra-chr \
--bfile europe/output/file1b \
--geno 0.1                `# we set genotype missiningness to 10% with this option` \
--make-bed \
--out europe/output/file2 \
--silent \
--missing;                 # --missing produces sample-based and variant-based missing data reports. If run with --within/--family, the variant-based report is stratified by cluster.
grep "variants" europe/output/file2.log

116475 variants loaded from europe/output/file1b.bim. –geno: 8699 variants removed due to missing genotype data. 107776 variants remaining after main filters.

Make plot

#   ____________________________________________________________________________
#   import individual missingness                                           ####
indmiss <-                                              # name of the data frame we are creating
  read.delim(                                           # use function read table
    file   = here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file2.smiss"
    ),                                                  # we use library here to import file2.imiss from data/QC
    header = TRUE                                       # we do have headers in our file
  )
#   ____________________________________________________________________________
#   import SNP missingness                                                  ####
snpmiss <-
  read.delim(
    file   = here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file2.vmiss"
    ),
    header = TRUE
  )
#

Plot individual missingness

# load plotting theme
source(
  here(
    "my_theme2.R"
  )
)

ggplot( # Start a ggplot object with the data and aesthetic mappings
  indmiss,
  aes(
    x = F_MISS
  )
) +
  geom_histogram( # Add a histogram layer
    color            = "black",
    fill             = "lightgray",
    bins             = 6
  ) +
  geom_text(
    # Add text labels for bin counts
    stat             = "bin",
    aes(
    label = after_stat(count)
  ),
    vjust            = -0.5,
    color            = "black",
    size             = 3,
    bins             = 6
  ) +
  geom_vline(
    # Add a vertical line at the mean of F_MISS
    aes(
    xintercept = mean(F_MISS)),
    color            = "red",
    linetype         = "dotted",
    linewidth        = .5
  ) +
  geom_text(
    # Add a text label for the mean of F_MISS
    aes(
      x = mean(F_MISS),
      y = 75,
      label = paste0(
        "Mean \n",
        scales::percent(mean(F_MISS),
          accuracy = 0.01
        )
      )
    ),
    size = 3,
    color = "red",
    hjust = -.1
  ) +
  labs( # Add axis labels
    x                = "Individual Missingness (%)",
    y                = "Count (n)"
  ) +
  my_theme() +
  scale_x_continuous( # Scale the x-axis to display percentages
    labels           = scales::percent,
    n.breaks         = 6
  )

#
# save the plot
ggsave(
  here(
    "output", "europe", "figures" , "individual_missingness_europe.pdf"
  ),
  width              = 7,
  height             = 5,
  units              = "in"
)
# Define a function to customize the theme
my_theme <- function() {
  theme_minimal(base_size = 12, base_family = "") +
    theme(
      panel.grid.major = element_line(
        linetype = "dashed",
        linewidth = 0.2,
        color = "gray"
      ),
      panel.grid.minor = element_line(
        linetype = "dashed",
        linewidth = 0.2,
        color = "gray"
      ),
      # Customize the x-axis label
      axis.title.x = element_text(
        angle          = 0,
        hjust          = 1,
        face           = "bold"
      ),
      # Customize the y-axis label
      axis.title.y = element_text(
        angle          = 90,
        hjust          = 1,
        face           = "bold"
      )
    )
}
# we can save the function to source it later
dump(                                                    # check ?dump for more information
  "my_theme",                                            # the object we want to save
  here(
   "my_theme2.R")                # use here to save it our function as .R 
)

Plot variant missingness

# This plot takes a while to compute
# This code creates a histogram from the indmiss data set using the F_MISS column.
# ggplot builds a histogram of individual missingness data
ggplot(
  snpmiss,
  aes(
    x = F_MISS
  )
) +
  geom_histogram(
    color = "black",
    fill = "lightgray",
    bins = 6
  ) +
  stat_bin(
    geom = "text",
    aes(
      label = format(
        after_stat(count),
        big.mark = ",",
        scientific = FALSE
      )
    ),
    vjust = -0.5,
    color = "black",
    size = 2,
    bins = 6
  ) +
  geom_vline(
    aes(
      xintercept = mean(F_MISS)
    ),
    color = "red",
    linetype = "dotted",
    linewidth = 0.5
  ) +
  geom_text(
    aes(
      x = mean(F_MISS),
      y = 16000,
      label = paste0(
        "Mean \n",
        scales::percent(mean(F_MISS),
          accuracy = 0.01
        )
      )
    ),
    size = 3,
    color = "red",
    # hjust = 1.5,
    vjust = -.2
  ) +
  labs(
    x = "Variant Missingness (%)",
    y = "Count (n)"
  ) +
  # theme_minimal(
  #   base_size = 12,
  #   base_family = "Roboto Condensed"
  # ) +
  scale_x_continuous(
    labels = scales::percent,
    n.breaks = 6
  ) +
  scale_y_continuous(
    labels = scales::label_comma(),
    n.breaks = 5
  ) +
  my_theme()

# save the plot
ggsave(
  here(
    "output", "europe", "figures", "SNPs_missingness_europe.pdf"
  ),
  width  = 7,
  height = 5,
  units  = "in"
)

Remove individuals missing more than 20% of SNPs. You can use the threshold you want, change the flag –mind

plink2 \
--allow-extra-chr \
--bfile europe/output/file2 \
--mind 0.2               `# here we set the individual missingness threshold of 20%`\
--make-bed \
--out europe/output/file3 \
--silent;
grep "samples\|variants" europe/output/file3.log

423 samples (44 females, 1 male, 378 ambiguous; 423 founders) loaded from 107776 variants loaded from europe/output/file2.bim. 0 samples removed due to missing genotype data (–mind). 423 samples (44 females, 1 male, 378 ambiguous; 423 founders) remaining after

We did not remove any SNP due to individual missingness

3.2. Minor allele frequency

First, we estimate the allele frequency with Plink.

plink2 \
--allow-extra-chr \
--bfile europe/output/file3 \
--freq \
--out europe/output/MAF_check \
--silent

Now we apply the MAF filter.

# We will use MAF of 10%
plink2 \
--allow-extra-chr \
--bfile europe/output/file3 \
--maf 0.1 \
--make-bed \
--out europe/output/file4 \
--silent;
grep "variants" europe/output/file4.log

107776 variants loaded from europe/output/file3.bim. 22470 variants removed due to allele frequency threshold(s) 85306 variants remaining after main filters.

Then we plot it with ggplot.

#   Import MAF data                                                         ####
maf_freq <-
  read.delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/MAF_check.afreq"
    ),
    header = TRUE
  )

Make MAF plot

#   make the plot                                                           ####
ggplot(
  maf_freq,
  aes(ALT_FREQS)
) +
  geom_histogram(
    colour = "black",
    fill = "lightgray",
    bins = 40
  ) +
  labs(
    x = "Minor Allele Frequency (MAF)",
    y = "Count (n)",
    caption = "<span style='color:red;'><i>Red</i></span> <span style='color:black;'><i>line at</i></span><span style='color:red;'><i> MAF 10%</i></span><span style='color:black;'><i> threshold</i></span>."
  ) +
  geom_text(
    aes(
      x = .1,
      y = 8000,
      label = paste0("22,470 SNPs")
    ),
    size = 3,
    color = "red",
    vjust = -.2
  ) +
  geom_vline(xintercept = 0.1, color = "red") +
  my_theme() +
  theme(plot.caption = element_markdown()) +
  scale_y_continuous(label = scales::number_format(big.mark = ",")) +
  scale_x_continuous(breaks = c(0, 0.1, 0.2, 0.4, 0.6, 0.8, 1))

# save the plot
ggsave(
  here(
    "output", "europe", "figures", "MAF_freq_plot_europe.pdf"
  ),
  width  = 7,
  height = 5,
  units  = "in"
)

We removed 22,470 variants due to the MAF filter. Next we will excludes markers which deviate from Hardy–Weinberg equilibrium (HWE). It is a common indicator of genotyping error, but may also indicate evolutionary selection. We have to do it for each population individually. We cannot do it for all populations at once. Therefore, the first step is create a new bed file with Plink keeping only one population. I like to create a new directory and name it “hardy”, and copy the “file4” there.

mkdir -p europe/output/files/hardy;
cp europe/output/file4.* europe/output/files/hardy/

3.3. HWE test

Now we can run the HWE test. However, we will need to apply the SNP missingness again for each population. If we do not, the HWE will vary widely. With the bash script below, we will create a new file for each population, run the HWE test with HWE p value <1e‐6 (HWE p value <1e‐6). Then, we ask Plink to generate a list of SNPs that passed the test for each population.

for fam in $(awk '{print $1}' europe/output/files/hardy/file4.fam | sort | uniq); 
do 
echo $fam | \
plink2 \
--allow-extra-chr \
--silent \
--keep-allele-order \
--bfile europe/output/files/hardy/file4 \
--keep-fam /dev/stdin \
--make-bed \
--out europe/output/files/hardy/$fam \
--hwe 0.000001 \
--geno 0.1 \
--write-snplist; \
done

Next, we use “cat” and “awk” to concatenate the SNP list from all populations, and remove duplicates. Once we have a list of SNPs that passed the test for each population, we can use Plink to create a new bed file keeping only the SNPs that passed the test in each population. First, lets get the list of SNPs, and count how many passed:

cat europe/output/files/hardy/*.snplist | awk '!a[$0]++' > europe/output/files/passed_hwe.txt;
wc -l europe/output/files/passed_hwe.txt

85306 europe/output/files/passed_hwe.txt

How many variants we had before

cat /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/hardy/file4.bim | awk '{print $2}'| awk '!a[$0]++' | wc -l
## 85306

All 85,306 variants passed HWE test. If some failed, next time we could remove the variants that did not pass HWE test, using the –extract flag, extracting only those that passed HWE.

3.4. Linkage analysis

We can analyse the LD patterns for each populations using different approaches. If we want to look at general patterns we can get an estimate for each population, for example, the half of the distance of the r2 max. We can use Plink2 to estimates some statistics for us. One important consideration to is how good is our genome assembly. Does it matter if we use the 574 scaffolds or assign each scaffold to a chromosome or not. So, if you want to know if it matters, you would need to create a new chromosomal scale and perform the calculations. Since, we want to get only an general approximate estimate of the linkage levels, we can look at the largest scaffolds. Since chromosome 1 has the M locus, we will get the largest scaffold of chromosome 2 and 3. We create a new chromosomal scale when we want to plot the data. For now, we can import a file that I created with the size of each scaffold (data/genome/scaffold_sizes.txt). The code below imports the file and get the 3 largest scaffolds from each chromosome.

#   import the file with the scaffold sizes                                 ####
scaffold_bps <-
  read_delim(
    here(
      "scaffold_sizes.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "cn"
  )
#
# set column names
colnames(
  scaffold_bps
) <- c(
  "Scaffold", "Size"
)
#   ____________________________________________________________________________
#   create new column with the chromosome number                            ####
sizes <-
  scaffold_bps |>
  mutate(
    Chromosome = case_when( # we use mutate to create a new column called Chromosome
      startsWith(
        Scaffold, "1"
      ) ~ "1",              # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
      startsWith(
        Scaffold, "2"
      ) ~ "2",
      startsWith(
        Scaffold, "3"
      ) ~ "3"
    )
  )
#
# group by Chromosome and get row with highest Size value
df_max <-
  sizes |>
  group_by(
    Chromosome
  ) |>
  slice_max(
    Size
  )
#
# print result
df_max |>
  mutate(
    Size_Mb = Size/1e6
  )
## # A tibble: 3 × 4
## # Groups:   Chromosome [3]
##   Scaffold     Size Chromosome Size_Mb
##   <chr>       <dbl> <chr>        <dbl>
## 1 1.85     28902815 1             28.9
## 2 2.17     43762705 2             43.8
## 3 3.75     25123842 3             25.1

The largest scaffolds of chromosome 2 is 43Mb, and chromosome 3 is 25Mb. We can also take a look at the size distribution of the scaffolds using ggplot2.

# create histogram using ggplot2
ggplot(
  sizes,
  aes(
    x = Size
  )
) +
  geom_histogram(
    binwidth = 1e6,
    aes(fill = factor(
      Chromosome
    )),
    color = "lightgray"
  ) +
  labs(
    title = "Scaffold sizes per chromosome",
    x = "Scaffold Size (Mb)",
    y = "Count (n)"
  ) +
  # xlab("Scaffold Size (Mb)") +
  # ylab("Count (n)") +
  my_theme() +
  scale_x_continuous(
    labels = comma_format(
      scale = 1e-6
    )
  ) +
  facet_wrap(
    ~Chromosome,
    ncol = 3,
    scales = "free_x"
  ) +
  scale_fill_manual(
    values = c(
      "pink", "green", "orange"
    )
  ) +
  guides(fill = "none")

Now, we can create a list of SNPs from these two large scaffolds. We hope they will give us an average LD statistic that represents the entire genome. We can import the .bim file and get the list of variants from scaffolds 2.17 (43Mb) and 3.75 (25Mb)

Import the .bim file with the SNPs

#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file4.bim" #output/populations/file4.bim
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.1      AX-583035163     0   315386 A       G      
## 2 1.1      AX-583033356     0   315674 C       T      
## 3 1.1      AX-583033370     0   330057 G       A      
## 4 1.1      AX-583035194     0   330265 A       G      
## 5 1.1      AX-583033387     0   331288 C       T      
## 6 1.10     AX-583035257     0    91677 T       C

We can write a function to import the bim files.

#   function to import bim files                                            ####
#
import_bim <- function(file) {
  # import as a tibble and set columns as integers
  bim <-
    read_delim(
      file,
      col_names      = FALSE,
      show_col_types = FALSE,
      col_types      = "ccidcc"
    )
  # rename the columns by index
  bim <- bim |>
    rename(
      Scaffold = 1,
      SNP      = 2,
      Cm       = 3,
      Position = 4,
      Allele1  = 5,
      Allele2  = 6
    )
  return(bim)
}
# we can save the function to source it later
dump(                                                     # check ?dump for more information
  "import_bim",                                           # the object we want to save
  here(
    "analyses", "import_bim.R")       # use here to save it our function as .R 
)

Now we can create the list of SNPs we want to use for the general linkage analysis.

snps_4ld <-
  snps |>
  filter( # scaffolds 2.17 (43Mb) and 3.75 (25Mb)
    Scaffold == "2.17" | Scaffold == "3.75"
  ) |> 
  select(
    "Scaffold",
    "SNP"
  )
nrow(snps_4ld)
## [1] 5689

We have 5689 SNPs to estimate LD that are located in the two largest scaffolds of chromosome 2 and 3.

#   save calculation to load later                                          ####
write.table(
  snps_4ld,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/snps_4ld.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Now we have to select the populations for which we want to estimate the LD statistic. Let only estimate LD for populations that we have 12 or more individuals.

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file4.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 12 {print}'
## ALU 12
## ALV 12
## BAR 12
## BRE 13
## CES 14
## CRO 12
## DES 17
## FRS 12
## GES 12
## GRA 12
## ITR 12
## KER 12
## KRA 12
## MAL 12
## POP 12
## RAR 12
## ROS 12
## SEV 12
## SIC 12
## SLO 12
## SOC 12
## TIK 12
## TRE 12
## TUA 12
## TUH 12

Now we can create a list of populations that have 12 or more individuals. We can use the command above but only print the list of families. We can create a new directory for the LD analysis.

# make directory
mkdir -p /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld

Create list of populations

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file4.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 12 {print}' | awk '{print $1}' > /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/pops_4ld.txt

3.4.1. Half distance r^2 max

Now we can use Plink1.9 to estimate LD and then bin the data

#load plink 1.9
module load PLINK/1.9b_6.21-x86_64
plink --version
#PLINK v1.90b6.21 64-bit (19 Oct 2020)
for fam in $(awk '{print $1}' europe/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--extract europe/output/files/ld/snps_4ld.txt \
--bfile europe/output/file4 \
--keep-fam /dev/stdin \
--maf 0.1 \
--r2 \
--out europe/output/files/ld/$fam \
--geno 0.2 \
--write-snplist \
--silent
done;
#
rm europe/output/files/ld/*.nosex

We will use only variants that are present in at least 50% of the populations. Once we subset the bed files per population, we set a MAF and genotype missingness of 10% as before but now we have smaller sample size and not all populations will have all the variants.

import os

# Replace with the directory containing the .snplist files
directory = "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld"

# Get a list of all .snplist files in the directory
file_list = [filename for filename in os.listdir(directory) if filename.endswith(".snplist")]

# Create a set to store the unique variants
unique_variants = set()

# Loop over each file and add the variants to the set
for filename in file_list:
    with open(os.path.join(directory, filename), "r") as file:
        for line in file:
            unique_variants.add(line.strip())

# Create a dictionary to store the number of files each variant appears in
variant_counts = {variant: 0 for variant in unique_variants}

# Loop over each file again and count the variants
for filename in file_list:
    with open(os.path.join(directory, filename), "r") as file:
        for line in file:
            variant = line.strip()
            variant_counts[variant] += 1

# Calculate the threshold for the number of files a variant must appear in
threshold = int(len(file_list) * 0.5)

# Get the list of unique variants that appear in at least 80% of files
all_variants = [variant for variant, count in variant_counts.items() if count >= threshold]

# Save the list of variants to a new file
with open("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/all_unique_variants_50pct.txt", "w") as file:
    file.write("\n".join(all_variants))
    
#66013
## 66013

Check how many SNPs

cat /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/all_unique_variants_50pct.txt | sort | awk '!seen[gensub(/^[[:space:]]+|[[:space:]]+$/, "", "g")]++' | wc -l
## 5078

Now we can estimate LD again using the 5,078 variants that are present in at least 50% of the populations.

Now we can use Plink1.9 to estimate LD

for fam in $(awk '{print $1}' europe/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--extract europe/output/files/ld/all_unique_variants_50pct.txt \
--bfile europe/output/file4 \
--keep-fam /dev/stdin \
--maf 0.1 \
--r2 \
--out europe/output/files/ld/$fam \
--geno 0.2 \
--write-snplist \
--silent
done;
#
rm europe/output/files/ld/*.nosex

Lets check one of the files

head /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/ALU.ld
##  CHR_A BP_A SNP_A CHR_B BP_B SNP_B R2 
##  2.17 56405 AX-584670672 2.17 70977 AX-584670684 0.571429 
##  2.17 56405 AX-584670672 2.17 71211 AX-585179355 0.308571 
##  2.17 56405 AX-584670672 2.17 71454 AX-584670696 0.25 
##  2.17 56405 AX-584670672 2.17 71656 AX-585179364 0.571429 
##  2.17 56405 AX-584670672 2.17 71887 AX-584670706 0.571429 
##  2.17 56405 AX-584670672 2.17 72138 AX-584670713 0.326531 
##  2.17 56405 AX-584670672 2.17 87654 AX-584670822 0.285714 
##  2.17 56405 AX-584670672 2.17 115267 AX-585179548 0.344234 
##  2.17 56405 AX-584670672 2.17 124491 AX-585179568 0.232126

Clean the LD files from Plink

ld_files <- list.files(path = "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld", pattern = "\\.ld$", full.names = TRUE)

for (file in ld_files) {
  system(paste("awk -i inplace '{gsub(/[[:blank:]]+/, \" \")}1'", file))
}

Lets check one of the files

head /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/ALU.ld
##  CHR_A BP_A SNP_A CHR_B BP_B SNP_B R2 
##  2.17 56405 AX-584670672 2.17 70977 AX-584670684 0.571429 
##  2.17 56405 AX-584670672 2.17 71211 AX-585179355 0.308571 
##  2.17 56405 AX-584670672 2.17 71454 AX-584670696 0.25 
##  2.17 56405 AX-584670672 2.17 71656 AX-585179364 0.571429 
##  2.17 56405 AX-584670672 2.17 71887 AX-584670706 0.571429 
##  2.17 56405 AX-584670672 2.17 72138 AX-584670713 0.326531 
##  2.17 56405 AX-584670672 2.17 87654 AX-584670822 0.285714 
##  2.17 56405 AX-584670672 2.17 115267 AX-585179548 0.344234 
##  2.17 56405 AX-584670672 2.17 124491 AX-585179568 0.232126

Function to import LD data

import_ld_files <- function(directory) {
  ld_files <-
    list.files(
      path = directory,
      pattern = "\\.ld$",
      full.names = TRUE
    )

  ld_tibbles <- list()

  for (file in ld_files) {
    ld_name <- gsub(".ld", "", basename(file))
    ld_data <- read_delim(
      file,
      col_names = TRUE,
      delim = " ",
      show_col_types = FALSE
    ) %>%
      select(c("CHR_A", "BP_A", "SNP_A", "CHR_B", "BP_B", "SNP_B", "R2"))
    ld_tibbles[[ld_name]] <- ld_data
  }

  return(ld_tibbles)
}

# we can save the function to source it later
dump(                                                     # check ?dump for more information
  "import_ld_files",                                           # the object we want to save
  here(
    "analyses", "import_ld_files.R")       # use here to save it our function as .R 
)

Import the LD data

my_ld_tibbles <- import_ld_files("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/")


head(my_ld_tibbles$ALV)
## # A tibble: 6 × 7
##   CHR_A  BP_A SNP_A        CHR_B   BP_B SNP_B           R2
##   <dbl> <dbl> <chr>        <dbl>  <dbl> <chr>        <dbl>
## 1  2.17 70977 AX-584670684  2.17 161344 AX-584670951 0.424
## 2  2.17 70977 AX-584670684  2.17 193047 AX-584670995 0.257
## 3  2.17 70977 AX-584670684  2.17 201334 AX-585179663 0.490
## 4  2.17 70977 AX-584670684  2.17 213672 AX-584671067 0.5  
## 5  2.17 70977 AX-584670684  2.17 213933 AX-585179740 1    
## 6  2.17 70977 AX-584670684  2.17 217433 AX-584671128 0.5

Estimate half distance of maximum R2 value.

Since we have several populations, it is better to write a function

get_half_pop <- function(ld_tibble) {
  # Split the tibble into sub-tibbles by scaffold
  ld_split <- split(ld_tibble, ld_tibble[, 1])
  # Remove rows with missing data
  ld_no_nan <- lapply(ld_split, function(x) {
    x[complete.cases(x), ]
  })
  # Calculate the distance between SNPs
  ld_dist <-
    lapply(ld_no_nan, function(x) {
      cbind(x, x[, 5] - x[, 2])
    })
  # Sort the sub-tibbles by distance
  ld_dist_order <- lapply(ld_dist, function(x) {
    x[order(x[, 8]), ]
  })
  # Combine the sub-tibbles into one tibble
  ld_proc <- do.call(rbind, ld_dist_order)
  # Sort the tibble by distance
  ld_sorted <- ld_proc[order(ld_proc[, 8]), ]
  # Bin the data by distance
  bin.1000 <-
    cut(ld_sorted[, 8], (c(0:1000) * 1000), include.lowest = TRUE)
  # Calculate the mean R2 for each bin
  ld_bin_1000_sorted <-
    tapply(ld_sorted[, 7], bin.1000, function(x) {
      mean(x)
    })
  # Fit a LOESS curve to the binned data
  lo.1000.ld <- loess(ld_bin_1000_sorted ~ c(1:1000))
  # Find the value of the binned R2 that corresponds to half the maximum R2 value
  d.pop <- max(ld_bin_1000_sorted, na.rm = TRUE)
  half.pop <- which(ld_bin_1000_sorted <= (d.pop / 2))[1]
  return(half.pop)
}
# we can save the function to source it later
dump(
  "get_half_pop",
  here(
    "analyses", "get_half_pop.R")
)

Test the function, it should return the same value as we got above

# for one population, you can change the population name to see other populations
half_alv <- get_half_pop(my_ld_tibbles$ALV)
half_alv
## [0,1e+03] 
##         1

Let’s estimates for all populations

# Create an empty data.frame to store the results
half_pops_df <-
  data.frame(
    row.names = names(my_ld_tibbles),
    half_distance = numeric(length(my_ld_tibbles)),
    stringsAsFactors = FALSE
  )

# Calculate the hald distance of max r^2 value for each tibble and store in the data.frame
for (tib_name in names(my_ld_tibbles)) {
  tib_half <- get_half_pop(my_ld_tibbles[[tib_name]])
  half_pops_df[tib_name, "half_distance"] <- tib_half
}

# Print the resulting data.frame
half_pops_df
##     half_distance
## ALU             5
## ALV             1
## BAR             1
## BRE             3
## CES             2
## CRO             2
## DES             2
## FRS             1
## GES             2
## GRA             1
## ITR             1
## KER             3
## KRA             1
## MAL             1
## POP             2
## RAR             2
## ROS             1
## SEV             2
## SIC             1
## SLO             1
## SOC             2
## TIK             3
## TRE             1
## TUA             1
## TUH             1

The half distance of the maximum r^2 for all populations vary from 1 to 5kb when we bin the data. We do see larger blocks but most of the ld blocks are small. You can use the entire genome to estimate LD instead of the larger scaffolds, but the result will be almost the same. I have done it, but I included only the larger scaffolds to make it easier for anyone to repeat the analysis. Once we start using the entire genome, we need to set a LD window, otherwise we will have billions of estimates since we would have to estimate r^2 for all pairs of SNPs on each chromosome. Let’s say we would have 500 SNPs on chromosome 1, we would have 2^500, which is 3.273391e+150 pairwise estimates.

Let’s make a plot for all populations

# Calculate half distances for each population
populations <- names(my_ld_tibbles)
half_distances <- sapply(populations, function(population) {
  ld_tibble <- my_ld_tibbles[[population]]
  half_pop <- get_half_pop(ld_tibble)
  return(half_pop)
})

# Create a tibble to store the half distances
half_distance_tibble <- data.frame(
  population = populations,
  half_distance = half_distances
)

# set the order of populations, you can change it on Excel then use it as a vector name pop_order
pop_order <-
  c(
    unique(
      half_distance_tibble$population
    )
  )

# set the order by continent (Africa, Americas, Asia, Europe) - check Excel spreadsheet named Pops_ld_order.xlxs or make your own. For example, range or continent.
half_distance_tibble$population <-
  factor(half_distance_tibble$population, levels = pop_order)

# Plot the half distances for each population
ggplot(half_distance_tibble, aes(x = population, y = half_distance)) +
  geom_bar(
    stat = "identity",
    fill = "lightgray",
    alpha = 0.8
  ) +
  geom_text(
    aes(label = round(half_distance, 2)),
    vjust = -0.5,
    size = 4,
    color = "red"
  ) +
  labs(x = "Population", y = expression(bold("Half Distance of Max " ~ r^
    2 ~ " (kb)"))) +
  my_theme() +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 14
    ),
    axis.text.y = element_text(size = 14),
    axis.title.x = element_text(size = 16, face = "bold"),
    axis.title.y = element_text(size = 16, face = "bold")
  )

# save the plot
ggsave(
  here(
    "output", "europe", "figures", "linkage_half_distance_LONG_scaffolds_europe.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in"
)

We can also plot the distribution of LD blocks across all scaffolds. Later we will create a new chromosomal scale, but for now lets estimate LD blocks using the genome as it is.

3.4.2. Estimate LD blocks using scaffolds

mkdir -p /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks

We can use Plink1.9 to estimate LD blocks for the populations with more than 12 individuals. We will use the entire genome for this part instead of the larger scaffolds only. We will set max distance of LD blocks of 200kb. We found out that the average half distance of r^2 max is small, from 1 to 5kb

for fam in $(awk '{print $1}' europe/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile europe/output/file4 \
--keep-fam /dev/stdin \
--maf 0.1 \
--blocks no-pheno-req \
--blocks-max-kb 200 \
--out europe/output/files/ld/blocks/$fam \
--geno 0.1 \
--silent
done;
#
rm europe/output/files/ld/blocks/*.nosex

Now we can get some data from our .log files

echo "Population,n,nVariants,geno,maf,passQC" > europe/output/files/ld/blocks/table_ld_stats.csv
for file in europe/output/files/ld/blocks/*.log
do
  variants=$(grep -oE '([0-9]+) variants loaded from \.bim file' $file | grep -oE '[0-9]+')
  geno=$(grep -oE '([0-9]+) variants removed due to missing genotype data \(--geno\)' $file | grep -oE '[0-9]+')
  maf=$(grep -oE '([0-9]+) variants removed due to minor allele threshold\(s\)' $file | grep -oE '[0-9]+')
  pass=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | head -1)
  n=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | tail -1)
  filename=$(basename $file .log)
  echo "$filename,$n,$variants,$geno,$maf,$pass" >> europe/output/files/ld/blocks/table_ld_stats.csv
done;
head -n 5 europe/output/files/ld/blocks/table_ld_stats.csv

Population,n,nVariants,geno,maf,passQC ALU,12,85306,5169,20672,59465 ALV,12,85306,4412,19004,61890 BAR,12,85306,5963,21916,57427 BRE,13,85306,10344,19831,55131

We can check the it out

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks/table_ld_stats.csv"
    ),
  header = TRUE,
  sep = ","
)

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 1: Summary of quality control for population data.")

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("output", "europe", "summary_maf_geno_filter_population_europe.docx"))

ft

Table 1: Summary of quality control for population data.

Population

n

nVariants

geno

maf

passQC

ALU

12

85,306

5,169

20,672

59,465

ALV

12

85,306

4,412

19,004

61,890

BAR

12

85,306

5,963

21,916

57,427

BRE

13

85,306

10,344

19,831

55,131

CES

14

85,306

15,654

22,004

47,648

CRO

12

85,306

4,515

18,678

62,113

DES

17

85,306

18,355

16,512

50,439

FRS

12

85,306

3,714

15,896

65,696

GES

12

85,306

4,107

22,962

58,237

GRA

12

85,306

5,964

17,923

61,419

ITR

12

85,306

3,611

16,391

65,304

KER

12

85,306

3,938

19,589

61,779

KRA

12

85,306

6,537

18,496

60,273

MAL

12

85,306

3,856

15,217

66,233

POP

12

85,306

3,944

13,767

67,595

RAR

12

85,306

4,049

24,005

57,252

ROS

12

85,306

6,254

19,530

59,522

SEV

12

85,306

4,424

25,210

55,672

SIC

12

85,306

8,629

20,046

56,631

SLO

12

85,306

6,840

13,586

64,880

SOC

12

85,306

4,212

22,432

58,662

TIK

12

85,306

4,125

24,999

56,182

TRE

12

85,306

3,683

12,556

69,067

TUA

12

85,306

5,130

19,036

61,140

TUH

12

85,306

4,663

16,952

63,691

Now we can count how many blocks we found in each population

cd /gpfs/gibbs/pi/caccone/mkc54/albo

wc -l europe/output/files/ld/blocks/*.blocks | \
  awk '{population = gensub(/\.blocks/, "", "g", $2); print population "\t" $1}' | \
  sed 's#europe/output/files/ld/blocks/##' | \
  sed '$d' > europe/output/files/ld/blocks/populations_block_counts.csv;
head -n 30 europe/output/files/ld/blocks/populations_block_counts.csv
## ALU  88
## ALV  89
## BAR  113
## BRE  316
## CES  399
## CRO  104
## DES  532
## FRS  70
## GES  183
## GRA  29
## ITR  95
## KER  138
## KRA  91
## MAL  76
## POP  77
## RAR  174
## ROS  89
## SEV  222
## SIC  168
## SLO  40
## SOC  136
## TIK  216
## TRE  82
## TUA  136
## TUH  132

Now we can add the number of blocks to the table we made

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks/table_ld_stats.csv"
    ,
  header = TRUE,
  sep = ","
)

# Load the population counts data from the CSV file
pop_counts <-
  read.delim(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks/populations_block_counts.csv"
    ,
    header = F,
    sep = "\t"
  ) |>
  rename(
    Population       = 1,
    nBlocks           = 2
  )

# Merge the population counts with the table data
ld_blocks <- merge(ld_blocks, pop_counts, by = "Population")

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 2: Number of linkage blocks detected with Plink for populations with at least 12 individuals.")

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("output", "europe", "summary_ld_blocks_europe.docx"))

ft

Table 2: Number of linkage blocks detected with Plink for populations with at least 12 individuals.

Population

n

nVariants

geno

maf

passQC

nBlocks

ALU

12

85,306

5,169

20,672

59,465

88

ALV

12

85,306

4,412

19,004

61,890

89

BAR

12

85,306

5,963

21,916

57,427

113

BRE

13

85,306

10,344

19,831

55,131

316

CES

14

85,306

15,654

22,004

47,648

399

CRO

12

85,306

4,515

18,678

62,113

104

DES

17

85,306

18,355

16,512

50,439

532

FRS

12

85,306

3,714

15,896

65,696

70

GES

12

85,306

4,107

22,962

58,237

183

GRA

12

85,306

5,964

17,923

61,419

29

ITR

12

85,306

3,611

16,391

65,304

95

KER

12

85,306

3,938

19,589

61,779

138

KRA

12

85,306

6,537

18,496

60,273

91

MAL

12

85,306

3,856

15,217

66,233

76

POP

12

85,306

3,944

13,767

67,595

77

RAR

12

85,306

4,049

24,005

57,252

174

ROS

12

85,306

6,254

19,530

59,522

89

SEV

12

85,306

4,424

25,210

55,672

222

SIC

12

85,306

8,629

20,046

56,631

168

SLO

12

85,306

6,840

13,586

64,880

40

SOC

12

85,306

4,212

22,432

58,662

136

TIK

12

85,306

4,125

24,999

56,182

216

TRE

12

85,306

3,683

12,556

69,067

82

TUA

12

85,306

5,130

19,036

61,140

136

TUH

12

85,306

4,663

16,952

63,691

132

Get the size of each block from the .block.det files

get_kb_column <- function(dir_path) {
  # obtain the list of files with extension .blocks.det
  file_names <- list.files(path = dir_path, pattern = "\\.blocks\\.det$", full.names = TRUE)
  
  # create an empty list to hold the data frames
  block_list <- list()
  
  # loop through the files and read the data into the list
  for (file in file_names) {
    df <- read.table(file, header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
    # select only the KB column and add it to the block_list with the file name
    block_list[[file]] <- df %>% select(KB) %>% add_column(file = file, .before = 1)
  }
  
  # combine the data frames in the block_list into a single data frame
  blocks <- bind_rows(block_list)
  
  # clean up the file name column
  blocks$file <- str_remove(blocks$file, "^.*\\/ld\\/blocks\\/")
  
  return(blocks)
}

# example usage: replace dir_path with your directory path
dir_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks")

blocks<- 
  get_kb_column(dir_path) |>
  mutate(file = str_remove(file, ".blocks.det")) |>
  as_tibble() |>
  rename(
    Population = 1
  )

Create density plot of the size of the LD blocks Plink found

# to check how many colors we need
# n_distinct(blocks$Population) 
#25

# define the color palette with 30 color blind colors
colors <- 
  c("#52ef99", "#146c45", "#75d5e1", "#2c4a5e", "#6a8fe0", "#8c61cd", "#f365e7", "#871550", "#f6c8de", "#a113b2", "#cf749b", "#2524f9",  "#cddb9b", "#799d10", "#a7e831", "#754819", "#fda547", "#a41415", "#fd5917", "#fd4e8b", "#ead624", "#21a708", "#332288", "#51f310", "#9d8d88")


# make plot using the sample y scale for all populations
ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = colors) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3) + my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here(
    "output",
    "europe",
    "figures",
    "block_density_y_scale_fixed_europe.pdf"
  ),
  width = 10,
  height = 10,
  units = "in"
)

Make plot allowing the y axis scale free

ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = colors) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3, scales = "free_y") +
  my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here(
    "output",
    "europe",
    "figures",
    "block_density_free_scale_europe.pdf"
  ),
  width = 20,
  height = 18,
  units = "in"
)

3.5. LD pruning

Since we do not have to remove any SNPs due to deviation from HWE, we can proceed with heterozygosity estimates. The first step is to “prune” our data set. We will check the pairwise linkage estimates for all SNPs. We can work with file4. We will use “indep-pairwise” to check if there are SNPs above a certain linkage disequilibrium (LD) threshold. Check Plink documentation for more details https://www.cog-genomics.org/plink/1.9/ld I used “--indep-pairwise 5 1 0.1” , which indicates according to the documentation: --indep-pairphase <window size>['kb'] <step size (variant ct)> <r^2 threshold> We will check in a window of 5kb if there is any pair of SNPs with r2 estimates above 0.1, then we will move our window 1 SNP and check again for SNPs above the threshold. We will repeat this procedure until we check the entire genome.

Need to switch back to plink2

module load PLINK/2_avx2_20221024
#PLINK/1.9b_6.21-x86_64 => PLINK/2_avx2_20221024
plink2 --version
plink2 \
--allow-extra-chr \
--bfile europe/output/file4 \
--extract europe/output/files/passed_hwe.txt \
--indep-pairwise 5 1 0.1 \
--out europe/output/indepSNP \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNP.log

423 samples (44 females, 1 male, 378 ambiguous; 423 founders) loaded from 85306 variants loaded from europe/output/file4.bim. –extract: 85306 variants remaining. 85306 variants remaining after main filters. –indep-pairwise (3 compute threads): 37710/85306 variants removed.

Remember, the SNPs are not removed from our data set. Plink created 3 files when we ran the code above. One is the “indepSNP.log” file, and the other two are: “indepSNP.prune.in” -> list of SNPs with squared correlation smaller than our r2 threshold of 0.1. “indepSNP.prune.out” -> list of SNPs with squared correlation greater than our r2 threshold of 0.1. For our heterozygosity estimates, we want to use the set of SNPs that are below our r2 threshold of 0.1. We consider that they are randomly associated. We can use Plink to estimate the heterozygosity using the “indepSNP.prune.in” file.

3.6. Heterozygosity

plink2 \
--allow-extra-chr \
--bfile europe/output/file4 \
--extract europe/output/indepSNP.prune.in \
--het \
--out europe/output/R_check \
--silent;
grep 'variants' europe/output/R_check.log

85306 variants loaded from europe/output/file4.bim. –extract: 47596 variants remaining. 47596 variants remaining after main filters.

#   find individuals with high heterozygosity                              ####
# import the data from Plink
het <- read.delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/R_check.het"
  ),
  head = TRUE
)
#
# check head of the file
colnames(het)
## [1] "X.FID"  "IID"    "O.HOM." "E.HOM." "OBS_CT" "F"

Estimate heterozygosity

# create a column named HET_RATE and calculate the heterozygosity rate
het$HET_RATE <- (het$"OBS_CT" - het$"O.HOM") / het$"OBS_CT"
#
# use subset function to get values deviating from 4sd of the mean heterozygosity rate.
het_fail <-
  subset(
    het, (het$HET_RATE < mean(
      het$HET_RATE
    ) - 4 * sd(
      het$HET_RATE
    )) |
      (het$HET_RATE > mean(
        het$HET_RATE
      ) + 4 * sd(
        het$HET_RATE
      ))
  )
#
# get the list of individuals that failed our threshold of 4sd from the mean.
het_fail$HET_DST <-
  (het_fail$HET_RATE - mean(
    het$HET_RATE
  )) / sd(
    het$HET_RATE
  )

Save the files to use with Plink

#   save the data to use with Plink2                                        ####
#
write.table(
  het_fail,
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fail-het-qc.txt"
  ),
  row.names = FALSE
)

Make plot

#   make a heterozygosity plot                                              ####
#
ggplot(
  het,
  aes(
    HET_RATE
  )
) +
  geom_histogram(
    colour           = "black",
    fill             = "lightgray",
    bins             = 40
  ) +
  labs(
    x                = "Heterozygosity Rate",
    y                = "Number of Individuals"
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      )
    ),
    col              = "red",
    linewidth        = 1.5
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      ) + 4 * sd(
        HET_RATE
      )
    ),
    col              = "#BFB9B9",
    linewidth        = 1
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      ) - 4 * sd(
        HET_RATE
      )
    ),
    col              = "#BFB9B9",
    linewidth        = 1
  ) + 
  my_theme() +
  scale_y_continuous(
  )

#   save the heterozygosity plot                                            ####
ggsave(
  here(
    "output", "europe", "figures", "Heterozygosity_europe.pdf"
  ),
  width  = 5,
  height = 4,
  units  = "in"
)

The red line in the plot above indicates the mean, and the gray lines indicate 4 standard deviation from the mean. We can see that some mosquitoes do have excess heterozygous sites. We will remove them. We can get their ID from the file “fail-het-qc.txt”. We can use the bash script below to parse the file to use with Plink

cd /gpfs/gibbs/pi/caccone/mkc54/albo
sed 's/"// g' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fail-het-qc.txt | awk '{print$1, $2}'> europe/output/het_fail_ind.txt;
echo 'How many mosquitoes we need to remove from our data set:';
cat /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/het_fail_ind.txt | tail -n +2 | wc -l;
echo 'Which mosquitoes we have to remove:';
tail -n +2 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/het_fail_ind.txt
## How many mosquitoes we need to remove from our data set:
## 5
## Which mosquitoes we have to remove:
## GRA 734
## TUA 783
## ITP 829
## ITP 832
## ROS 858

Heterozygosity is outside the SD threshold in 2 ITP individuals (829, 832), while GRA (734), TUA (783), and ROS (858) each have 1. We will remove these 5 individuals.

Next, we will remove these mosquitoes from our data set using Plink:

plink2 \
--allow-extra-chr \
--bfile europe/output/file4 \
--remove europe/output/het_fail_ind.txt \
--make-bed \
--out europe/output/file5 \
--silent;
grep 'variants\|samples' europe/output/file5.log

423 samples (44 females, 1 male, 378 ambiguous; 423 founders) loaded from 85306 variants loaded from europe/output/file4.bim. –remove: 418 samples remaining. 418 samples (44 females, 1 male, 373 ambiguous; 418 founders) remaining after

3.7. Relatedness

Check for cryptic relatedness. Check Plink2 documentation https://www.cog-genomics.org/plink/2.0/distance You can download King directly https://www.kingrelatedness.com/manual.shtml or check their manuscript https://www.ncbi.nlm.populations.gov/pmc/articles/PMC3025716/pdf/btq559.pdf From Plink2 documentation: “Note that KING kinship coefficients are scaled such that duplicate samples have kinship 0.5, not 1. First-degree relations (parent-child, full siblings) correspond to ~0.25, second-degree relations correspond to ~0.125, etc. It is conventional to use a cutoff of ~0.354 (the geometric mean of 0.5 and 0.25) to screen for monozygotic twins and duplicate samples, ~0.177 to add first-degree relations, etc.” There two options. One is to run only –make-king and another one is to use –make-king-table We will use the threshold of 0.354 and create a table.

# Plink2 will create a file with extension .king
plink2 \
--allow-extra-chr \
--bfile europe/output/file5 \
--extract europe/output/indepSNP.prune.in \
--make-king-table rel-check \
--king-table-filter 0.354 \
--out europe/output/file5 \
--silent;
grep 'variants\|samples' europe/output/file5.log

423 samples (44 females, 1 male, 378 ambiguous; 423 founders) loaded from 85306 variants loaded from europe/output/file4.bim. –extract: 47596 variants remaining. 47596 variants remaining after main filters.

Check the individuals that did not pass our filtering.

head /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file5.kin0
## #FID1    IID1    FID2    IID2    NSNP    HETHET  IBS0    KINSHIP
## DES  1445    DES 1292    45918   0.266954    0.00511782  0.441964
## ITB  748 ITB 747 46227   0.268479    0.0123521   0.398236
## SIC  1231    SIC 1229    44489   0.273933    0.0036863   0.45193
## SIC  1235    SIC 1233    44896   0.259132    0.00779579  0.403776
## SIC  1235    SIC 1234    44365   0.241113    0.0116082   0.362336
## SIC  1236    SIC 1234    44586   0.247858    0.0104966   0.377416
## SPM  767 SPM 765 44490   0.253383    0.00651832  0.354426
## SPS  915 SPS 914 46033   0.225577    0.0120131   0.362151
## TUA  779 TUA 776 45693   0.245355    0.0147287   0.369513

We want to remove one of the individuals of the pairs.

plink2 \
--allow-extra-chr \
--bfile europe/output/file5 \
--extract europe/output/indepSNP.prune.in \
--make-king triangle bin \
--out europe/output/file6 \
--silent;
grep 'variants\|samples' europe/output/file6.log

418 samples (44 females, 1 male, 373 ambiguous; 418 founders) loaded from 85306 variants loaded from europe/output/file5.bim. –extract: 47596 variants remaining. 47596 variants remaining after main filters. –make-king pass 1/1: Scanning for rare variants… done. 0 variants handled by initial scan (47596 remaining). –make-king: 47596 variants processed.

Now we can use Plink2 to remove one of the mosquitoes from the pair with high kinship. It will remove 32 samples since we had 2 samples in some populations with high relatedness, and we could remove 1 and keep the other two. Plink2 always tries to maximize the number of samples passing the filters.

plink2 \
--allow-extra-chr \
--bfile europe/output/file5 \
--king-cutoff europe/output/file6 0.354 \
--make-bed \
--out europe/output/file7 \
--silent;
grep 'samples\|variants\|remaining' europe/output/file7.log

418 samples (44 females, 1 male, 373 ambiguous; 418 founders) loaded from 85306 variants loaded from europe/output/file5.bim. europe/output/file7.king.cutoff.out.id , and 409 remaining sample IDs written

Now we can check the individuals that were removed.

# we have 19 samples that were removed, and our file has heading, so we need to see 13 lines of the file
head -n 12 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7.king.cutoff.out.id
## #FID IID
## SIC  1231
## SIC  1235
## SIC  1236
## DES  1445
## ITB  748
## SPM  767
## TUA  779
## TUA  780
## SPS  914

4. Create chromosomal scale

library(tidyverse)
library(here)

Import the .bim file with the SNPs to create a new chromosomal scale.

#   ____________________________________________________________________________
#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-583035163     0   315386 A       G      
## 2 1        AX-583033356     0   315674 C       T      
## 3 1        AX-583033370     0   330057 G       A      
## 4 1        AX-583035194     0   330265 A       G      
## 5 1        AX-583033387     0   331288 C       T      
## 6 1        AX-583035257     0   442875 T       C

We can write a function to import the bim files.

#   ____________________________________________________________________________
#   function to import bim files                                            ####
#
import_bim <- function(file) {
  # import as a tibble and set columns as integers
  bim <-
    read_delim(
      file,
      col_names      = FALSE,
      show_col_types = FALSE,
      col_types      = "ccidcc"
    )
  # rename the columns by index
  bim <- bim |>
    rename(
      Scaffold = 1,
      SNP      = 2,
      Cm       = 3,
      Position = 4,
      Allele1  = 5,
      Allele2  = 6
    )
  return(bim)
}
# we can save the function to source it later
dump(                                                     # check ?dump for more information
  "import_bim",                                           # the object we want to save
  here(
    "analyses", "import_bim.R")       # use here to save it our function as .R 
)

Separate the tibbles into each chromosome.

#   ____________________________________________________________________________
#   separate the SNP data per chromosome                                    ####
# chr1
chr1_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^1."
    )
  ) |> # here we get only Scaffold rows starting with 1
  as_tibble() # save as tibble
#
# chr2
chr2_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^2."
    )
  ) |>
  as_tibble()
#
# chr3
chr3_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^3."
    )
  ) |>
  as_tibble()
module load SAMtools/1.18-GCC-12.2.0

Now we can index the reference genome with the new scaffold names that mach our .bim file

# index the genome
samtools faidx /gpfs/gibbs/project/caccone/mkc54/albo/genome/albo.fasta.gz 

Now we can get the scaffold order and their size from the .fai file. Check the about it at Samtools documentation HERE.

# check the head of the .fai file
head /gpfs/gibbs/project/caccone/mkc54/albo/genome/albo.fasta.gz.fai
# For each row:
# Column 1: The scaffold name. In your FASTA file, this is preceded by '>'
# Column 2: The number of bases in the scaffol
# Column 3: The byte index of the file where the scaffold sequence begins.
# Column 4: bases per line in the FASTA file
# Column 5: bytes per line in the FASTA file
# we can use awk to get the first two columns, I also change the field separator.
cat /gpfs/gibbs/project/caccone/mkc54/albo/genome/albo.fasta.gz.fai | awk 'BEGIN{FS=" "; OFS="\t"} {print $1, $2}' > /gpfs/gibbs/project/caccone/mkc54/albo/genome/scaffold_sizes.txt;
echo "scaffold sizes";
# check the output
head /gpfs/gibbs/project/caccone/mkc54/albo/genome/scaffold_sizes.txt
# since we fixed the scaffold order previous, and also moved the scaffold 1.86 to its correct position, we can move ahead and calculate the new scale for our SNPs.

Import the file with sizes of each scaffold.

#   ____________________________________________________________________________
#   import the file with the scaffold sizes                                 ####
sizes <-
  read_delim(
    here(
      "/gpfs/gibbs/project/caccone/mkc54/albo/genome/scaffold_sizes.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "cd"
  )
#
# set column names
colnames(
  sizes
) <- c(
  "Scaffold", "Size"
)
#   ____________________________________________________________________________
#   create new column with the chromosome number                            ####
sizes <- 
  sizes |>
  mutate(
    Chromosome = case_when( # we use mutate to create a new column called Chromosome
      startsWith(
        Scaffold, "1"
      ) ~ "1", # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
      startsWith(
        Scaffold, "2"
      ) ~ "2",
      startsWith(
        Scaffold, "3"
      ) ~ "3"
    )
  ) |>
  arrange(
    Scaffold
  )                   # to sort the order of the scaffolds, fixing the problem we have with scaffold 1.86
# check it
head(sizes)
## # A tibble: 6 × 3
##   Scaffold     Size Chromosome
##   <chr>       <dbl> <chr>     
## 1 1.1        351198 1         
## 2 1.10     11939576 1         
## 3 1.100     3389100 1         
## 4 1.101      470438 1         
## 5 1.102     2525157 1         
## 6 1.103      150026 1

Create new scale. Get the scaffolds for each chromosome.

#   ____________________________________________________________________________
#   separate the scaffold sizes tibble per chromosome                       ####
# chr1
chr1_scaffolds <- 
  sizes |>
  filter(
    str_detect(
      Scaffold, "^1" # we use library stringr to get scaffolds starting with 1 (chromosome 1)
    )
  ) |> 
  as_tibble()
#
# chr2
chr2_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^2" # we use library stringr to get scaffolds starting with 2 (chromosome 2)
    )
  ) |> 
  as_tibble()
#
# # chr3
chr3_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^3" # we use library stringr to get scaffolds starting with 3 (chromosome 3)
    )
  ) |>
  as_tibble()

Create a scale for each chromosome.

#   ____________________________________________________________________________
#   create a new scale for each chromosome                                  ####
# chr1
chr1_scaffolds$overall_size_before_bp <-
  0                                                                        # we create a new column with zeros
for (i in 2:nrow(
  chr1_scaffolds
)
) {                                                                        # loop to start on second line
  chr1_scaffolds$overall_size_before_bp[i] <-                              # set position on the scale
    chr1_scaffolds$overall_size_before_bp[i - 1] + chr1_scaffolds$Size[i - # add the scaffold size and the location to get position on new scale
      1]
}
#
# chr2
chr2_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr2_scaffolds
)
) {
  chr2_scaffolds$overall_size_before_bp[i] <-
    chr2_scaffolds$overall_size_before_bp[i - 1] + chr2_scaffolds$Size[i -
      1]
}
#
# chr3
chr3_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr3_scaffolds
)
) {
  chr3_scaffolds$overall_size_before_bp[i] <-
    chr3_scaffolds$overall_size_before_bp[i - 1] + chr3_scaffolds$Size[i -
      1]
}

Merge the data frames scaffolds and SNPs.

#   ____________________________________________________________________________
#   merge the data sets using the tidyverse function left_join              ####
# chr1
chr1_scale <-
  chr1_snps |>          # create data frame for each chromosome, get chr1_snps
  left_join(            # use lef_join function to merge it with chr1_scaffolds
    chr1_scaffolds,
    by = "Scaffold"
  ) |>                  # set column to use for merging (Scaffold in this case)
  na.omit() |>          # remove NAs, we don't have SNPs in every scaffold
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +                 # make new columns numeric
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr2
chr2_scale <-
  chr2_snps |>
  left_join(
    chr2_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr3
chr3_scale <-
  chr3_snps |>
  left_join(
    chr3_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )

Merge all chromosome scales.

#   ____________________________________________________________________________
#   merge the data sets, and select only the columns we need                ####
chroms <- rbind(
  chr1_scale, chr2_scale, chr3_scale
) |>
  dplyr::select(
    Chromosome, SNP, Cm, midPos_fullseq, Allele1, Allele2
  )

Save the new .bim file

#   ____________________________________________________________________________
#   save the new bim file with a new name, I added "B"                      ####
write.table(
  chroms,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7B.bim"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Rename the .bim files

# change the name of the first .bim file, for example, append _backup.bim, and then replace the original file
mv europe/output/file7.bim europe/output/file7_backup.bim;
# than change the new bim we create to the original name (do it only once, otherwise it will mess up)
mv europe/output/file7B.bim europe/output/file7.bim

Create a new bed file with Plink2 to see if it works. For example, to see if the variants are in the right order. Plink2 will give us a warning.

plink2 \
--bfile europe/output/file7 \
--make-bed \
--out europe/output/test01;
# then we remove the files 
rm europe/output/test01.*

PLINK v2.00a3.7LM AVX2 Intel (24 Oct 2022) www.cog-genomics.org/plink/2.0/ (C) 2005-2022 Shaun Purcell, Christopher Chang GNU General Public License v3 Logging to europe/output/test01.log. Options in effect: –bfile europe/output/file7 –make-bed –out europe/output/test01 Start time: Thu Sep 28 14:24:39 2023 257259 MiB RAM detected; reserving 128629 MiB for main workspace. Using up to 32 threads (change this with –threads). 409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from europe/output/file7.fam. 85306 variants loaded from europe/output/file7.bim. Note: No phenotype data present. Writing europe/output/test01.fam … done. Writing europe/output/test01.bim … done. Writing europe/output/test01.bed … done. End time: Thu Sep 28 14:24:39 2023

No warnings from Plink2! Now, we can go ahead with our analysis.

4.1. Estimate LD blocks using chromosomal scale

mkdir -p europe/output/files/ld/blocks_chr
module load PLINK/1.9b_6.21-x86_64

We can use Plink1.9 to estimate LD blocks for the populations with more than 12 individuals. We will use the entire genome for this part instead of the larger scaffolds only. We will set max distance of LD blocks of 500kb. We found out that the average half distance of r^2 max is small, from 1 to 5kb

for fam in $(awk '{print $1}' europe/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile europe/output/file7 \
--keep-fam /dev/stdin \
--maf 0.1 \
--blocks no-pheno-req \
--blocks-max-kb 200 \
--out europe/output/files/ld/blocks_chr/$fam \
--geno 0.1 \
--silent
done;
#
rm europe/output/files/ld/blocks_chr/*.nosex

Now we can get some data from our .log files

cd /gpfs/gibbs/pi/caccone/mkc54/albo
echo "Population,n,nVariants,geno,maf,passQC" > europe/output/files/ld/blocks_chr/table_ld_stats.csv
for file in europe/output/files/ld/blocks_chr/*.log
do
  variants=$(grep -oE '([0-9]+) variants loaded from \.bim file' $file | grep -oE '[0-9]+')
  geno=$(grep -oE '([0-9]+) variants removed due to missing genotype data \(--geno\)' $file | grep -oE '[0-9]+')
  maf=$(grep -oE '([0-9]+) variants removed due to minor allele threshold\(s\)' $file | grep -oE '[0-9]+')
  pass=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | head -1)
  n=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | tail -1)
  filename=$(basename $file .log)
  echo "$filename,$n,$variants,$geno,$maf,$pass" >> europe/output/files/ld/blocks_chr/table_ld_stats.csv
done;
head -n 5 europe/output/files/ld/blocks_chr/table_ld_stats.csv
## Population,n,nVariants,geno,maf,passQC
## ALU,12,85306,5169,20672,59465
## ALV,12,85306,4412,19004,61890
## BAR,12,85306,5963,21916,57427
## BRE,13,85306,10344,19831,55131

Reload libraries

library(dplyr)
library(ggplot2)
library(colorout)
library(extrafont)
library(reticulate)
library(scales)
library(stringr)
library(grid)
library(flextable)
library(devtools)
library(readr)
library(purrr)
library(ggtext)
library(ggvenn)
library(RColorBrewer)
library(ggrepel)

We can check it out

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr/table_ld_stats.csv"
    ),
  header = TRUE,
  sep = ","
)

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 1: Summary of quality control for Europe population data.")

ft

Table 1: Summary of quality control for Europe population data.

Population

n

nVariants

geno

maf

passQC

ALU

12

85,306

5,169

20,672

59,465

ALV

12

85,306

4,412

19,004

61,890

BAR

12

85,306

5,963

21,916

57,427

BRE

13

85,306

10,344

19,831

55,131

CES

14

85,306

15,654

22,004

47,648

CRO

12

85,306

4,515

18,678

62,113

DES

16

85,306

17,901

16,361

51,044

FRS

12

85,306

3,714

15,896

65,696

GES

12

85,306

4,107

22,962

58,237

GRA

11

85,306

5,006

19,065

61,235

ITR

12

85,306

3,611

16,391

65,304

KER

12

85,306

3,938

19,589

61,779

KRA

12

85,306

6,537

18,496

60,273

MAL

12

85,306

3,856

15,217

66,233

POP

12

85,306

3,944

13,767

67,595

RAR

12

85,306

4,049

24,005

57,252

ROS

11

85,306

5,147

20,556

59,603

SEV

12

85,306

4,424

25,210

55,672

SIC

9

85,306

19,211

12,440

53,655

SLO

12

85,306

6,840

13,586

64,880

SOC

12

85,306

4,212

22,432

58,662

TIK

12

85,306

4,125

24,999

56,182

TRE

12

85,306

3,683

12,556

69,067

TUA

9

85,306

12,653

15,410

57,243

TUH

12

85,306

4,663

16,952

63,691

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/summary_blocks_chr.docx"))

Now we can count how many blocks we found in each population

cd /gpfs/gibbs/pi/caccone/mkc54/albo
wc -l europe/output/files/ld/blocks_chr/*.blocks | \
  awk '{population = gensub(/\.blocks_chr/, "", "g", $2); print population "\t" $1}' | \
  sed 's#europe/output/files/ld/blocks_chr/##' | \
  sed 's/.blocks//' | \
  sed '$d' > europe/output/files/ld/blocks_chr/populations_block_counts.csv;
head -n 30 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr/populations_block_counts.csv
## ALU  88
## ALV  89
## BAR  113
## BRE  316
## CES  399
## CRO  104
## DES  404
## FRS  70
## GES  183
## GRA  26
## ITR  95
## KER  138
## KRA  91
## MAL  76
## POP  77
## RAR  174
## ROS  80
## SEV  222
## SIC  0
## SLO  40
## SOC  136
## TIK  216
## TRE  82
## TUA  3
## TUH  132

Now we can add the number of blocks to the table we made

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr/table_ld_stats.csv"
    ),
  header = TRUE,
  sep = ","
)

# Load the population counts data from the CSV file
pop_counts <-
  read.delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr/populations_block_counts.csv"
    ),
    header = F,
    sep = "\t"
  ) |>
  rename(
    Population       = 1,
    nBlocks          = 2
  )

# Merge the population counts with the table data
ld_blocks <- merge(ld_blocks, pop_counts, by = "Population")

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 2: Number of linkage blocks detected with Plink for populations with at least 10 individuals.")

ft

Table 2: Number of linkage blocks detected with Plink for populations with at least 10 individuals.

Population

n

nVariants

geno

maf

passQC

nBlocks

ALU

12

85,306

5,169

20,672

59,465

88

ALV

12

85,306

4,412

19,004

61,890

89

BAR

12

85,306

5,963

21,916

57,427

113

BRE

13

85,306

10,344

19,831

55,131

316

CES

14

85,306

15,654

22,004

47,648

399

CRO

12

85,306

4,515

18,678

62,113

104

DES

16

85,306

17,901

16,361

51,044

404

FRS

12

85,306

3,714

15,896

65,696

70

GES

12

85,306

4,107

22,962

58,237

183

GRA

11

85,306

5,006

19,065

61,235

26

ITR

12

85,306

3,611

16,391

65,304

95

KER

12

85,306

3,938

19,589

61,779

138

KRA

12

85,306

6,537

18,496

60,273

91

MAL

12

85,306

3,856

15,217

66,233

76

POP

12

85,306

3,944

13,767

67,595

77

RAR

12

85,306

4,049

24,005

57,252

174

ROS

11

85,306

5,147

20,556

59,603

80

SEV

12

85,306

4,424

25,210

55,672

222

SIC

9

85,306

19,211

12,440

53,655

0

SLO

12

85,306

6,840

13,586

64,880

40

SOC

12

85,306

4,212

22,432

58,662

136

TIK

12

85,306

4,125

24,999

56,182

216

TRE

12

85,306

3,683

12,556

69,067

82

TUA

9

85,306

12,653

15,410

57,243

3

TUH

12

85,306

4,663

16,952

63,691

132

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/summary_ld_blocks_chr.docx"))

Get the size of each block from the .block.det files

get_kb_column <- function(dir_path) {
  # obtain the list of files with extension .blocks.det
  file_names <- list.files(path = dir_path, pattern = "\\.blocks\\.det$", full.names = TRUE)
  
  # create an empty list to hold the data frames
  block_list <- list()
  
  # loop through the files and read the data into the list
  for (file in file_names) {
    df <- read.table(file, header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
    # select only the KB column and add it to the block_list with the file name
    block_list[[file]] <- df %>% dplyr::select(KB) %>% add_column(file = file, .before = 1)
  }
  
  # combine the data frames in the block_list into a single data frame
  blocks <- bind_rows(block_list)
  
  # clean up the file name column
  blocks$file <- str_remove(blocks$file, "^.*\\/ld\\/blocks\\/")
  
  return(blocks)
}

# example usage: replace dir_path with your directory path
dir_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr")

blocks<- 
  get_kb_column(dir_path) |>
  mutate(file = str_remove(file, "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr/")) |>
  mutate(file = str_remove(file, ".blocks.det")) |>

  as_tibble() |>
  rename(
    Population = 1
  )

Create density plot of the size of the LD blocks Plink found

# to check how many colors we need
# n_distinct(blocks$Population) #24

source(
  here(
    "my_theme2.R"
  )
)

# define the color palette 
colors <- 
  c("#52ef99", "#146c45", "#75d5e1", "#2c4a5e", "#6a8fe0", "#8c61cd", "#f365e7", "#871550", "#f6c8de", "#a113b2", "#cf749b", "#2524f9",  "#cddb9b", "#799d10", "#a7e831", "#754819", "#fda547", "#a41415", "#fd5917", "#fd4e8b", "#ead624", "#21a708", "#332288", "#51f310", "#9d8d88")


# make plot using the sample y scale for all populations
ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = colors) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3) + my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here(
    "output",
    "europe",
    "figures",
    "block_density_y_scale_fixed_chr_europe.pdf"
  ),
  width = 10,
  height = 10,
  units = "in"
)

Make plot allowing the y axis scale free

ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = colors) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3, scales = "free_y") +
  my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here(
    "output",
    "europe",
    "figures",
    "block_density_free_scale_chr_europe.pdf"
  ),
  width = 20,
  height = 18,
  units = "in"
)

5. Plot SNP density

After quality control with approximately 85k SNPs

# load the function that we saved earlier
source(
  here(
    "analyses", "import_bim.R"
  ),
  local = knitr::knit_global()
)
# import the file
snp_den_qc <- import_bim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7.bim"
  )
)

Make plot of the SNP density

#   plot SNP density after QC  
snp_den_qc |>
  rename(
    Chromosome = 1
  ) |>
  mutate(
    Position           = as.numeric(
      Position
    )
  ) |>
  ggplot(
    aes(
      x                = Position
    ),
    label              = sprintf(
      "%0.2f",
      round(
        a,
        digits         = 0
      )
    )
  ) +
  geom_histogram(
    aes(
      y                = after_stat(
        count
      )
    ),
    binwidth           = 1e6
  ) +
  facet_wrap(
    vars(
      Chromosome
    ),
    scales             = "free_x"
  ) +
  labs(
    title              = "SNP Density after QC",
    x                  = expression(
      "Position in the genome (Mb)"
    ),
    y                  = expression(
      "Number of SNPs"
    )
  ) +
  scale_x_continuous(
    labels             = function(x) {
      format(
        x / 1e6,
        big.mark       = ",", 
        scientific     = FALSE
      )
    }
  ) +
  geom_density(
  aes(
    y = 1e6 * after_stat(count)
  ),
  color = "red",
  linewidth = .75,
  alpha = .4,
  fill = "pink"
  ) +
    theme(
    panel.grid.major   = element_line(
      linetype         = "dashed",
      linewidth        = 0.2
    ),
    panel.grid.minor   = element_line(
      linetype         = "dashed",
      linewidth        = 0.2
    ),
    panel.spacing      = unit(0.5, "lines"),
    strip.text         = element_text(
      face             = "bold", hjust = .5
    ),
    strip.background.x = element_rect(
      color            = "gray"
    )
  )

#   save the density plot
ggsave(
  here(
    "output", "europe", "figures", "snp_density_after_qc_europe.pdf"
  ),
  width  = 10,
  height = 6,
  units  = "in"
)

SNPs per chromosome

# we can use dplyr "count" to get the number of SNPs for each chromosome
# lets get the data we need
snps_per_chrm <- 
  snp_den_qc |>
  count(
    Scaffold) |>
  rename(
    Chromosome = 1,
    "SNPs (N) " = 2
  )

# Create the flextable
ft <- flextable::flextable(snps_per_chrm)

# Apply zebra theme
ft <- flextable::theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "SNPs per chromosome after quality control")
ft

SNPs per chromosome after quality control

Chromosome

SNPs (N)

1

18,957

2

35,754

3

30,595

We can get the mean number of SNPs per chromosome for the entire genome

# we first use dplyr cut_width to get the number of SNPs per 1Mb window
albo_den <- 
  snp_den_qc |>
  dplyr::select(
    Scaffold, Position
  ) |>
  group_by(
    Scaffold,
    windows               = cut_width(
      Position,
      width               = 1e6,
      boundary            = 0
    )
  ) |>
  summarise(
    n                     = n(),
    .groups               = "keep"
  ) |>
  group_by(
    Scaffold
  ) |>
  summarise(
    mean                  = mean(n),
    n                     = n(),
    .groups               = "keep"
  ) |>
  rename(
    Chromosome            = 1,
    "SNPs per 1Mb window" = 2,
    "Number of windows"   = 3
  )
#
# check the results
snp_table <-
  flextable(
    albo_den
  )
snp_table <- colformat_double(
  x        = snp_table,
  big.mark = ",",
  digits   = 2,
  na_str   = "N/A"
)
snp_table

Chromosome

SNPs per 1Mb window

Number of windows

1

51.80

366

2

61.75

579

3

62.69

488

Merge objects

# we can merge the two data sets we created above into one table
after_qc <-
  snps_per_chrm |>
  left_join(
    albo_den,
    by = "Chromosome"
  )
snp_table2 <- flextable(
  after_qc)
snp_table2 <- colformat_double(
  x        = snp_table2,
  big.mark = ",",
  digits   = 2, 
  na_str   = "N/A"
  )
snp_table2

Chromosome

SNPs (N)

SNPs per 1Mb window

Number of windows

1

18,957

51.80

366

2

35,754

61.75

579

3

30,595

62.69

488

We also want to explore the ancestry of our populations at chromosome level. Ae. albopictus has homomorphic chromosomes, where there is no differentiation between autosomes and sexual chromosomes. Sex determination is controlled by the M locus located on chromosome 1 p arm (short arm) [@gomulskiNixLocusMalespecific2018]. There some indications that increased linkage due to M locus in chromosome 1 will result in the formation of a sex chromosome over time [@ayalaChromosomeSpeciationHumans2005]. Due to increased linkage in M locus and low recombination between parental chromosomes because of selection pressure to maintain the sex determination functional, we might detect differences in the resolution of our ancestry analysis. For example, due to increased linkage and lower recombination rates in chromosome 1, we might observe some differences in the performance of the algorithms we use to evaluate ancestry in this species. Therefore, we can subset our data set and run the same analysis. We can subset per chromosome and chromosome 2 + 3. We can use R to create SNP lists for each subset we want to test. Then, we can use Plink2 to extract the SNPs lists and create files for ancestry analysis.

6. LD prunimng using the chromosomal scale

# we set a window of variants of 5 and move the window 1 variant per time, removing 1 of the variants with lowest MAF from a pair above the threshold of r^2 > 0.1
plink2 \
--allow-extra-chr \
--bfile europe/output/file10 \
--indep-pairwise 5 1 0.1 \
--out europe/output/indepSNP_chr \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNP_chr.log

409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 85306 variants loaded from europe/output/file10.bim. –indep-pairwise (3 compute threads): 37822/85306 variants removed.

Lets do the scaffold again

First need to import same SNP list

Import the .bim file with the SNPs

#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7_backup.bim" #output/populations/file4.bim
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.1      AX-583035163     0   315386 A       G      
## 2 1.1      AX-583033356     0   315674 C       T      
## 3 1.1      AX-583033370     0   330057 G       A      
## 4 1.1      AX-583035194     0   330265 A       G      
## 5 1.1      AX-583033387     0   331288 C       T      
## 6 1.10     AX-583035257     0    91677 T       C
write.table(
  snps,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_all_scaffolds_4ld.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

nrow(snps)
plink2 \
--allow-extra-chr \
--bfile europe/output/file5 \
--king-cutoff europe/output/file6 0.354 \
--make-bed \
--out europe/output/file11a \
--silent;
grep 'samples\|variants\|remaining' europe/output/file11a.log

418 samples (44 females, 1 male, 373 ambiguous; 418 founders) loaded from 85306 variants loaded from europe/output/file5.bim. europe/output/file11a.king.cutoff.out.id , and 409 remaining sample IDs written

# we set a window of variants of 5 and move the window 1 variant each time, removing 1 of the variants with lowest MAF from a pair above the threshold of r^2 > 0.1
# the mean distance is 203kb across the tested populations. We used --indep-pairwise 5 1 0.1 before. We can use the same values from the mean half distance max r2
plink2 \
--allow-extra-chr \
--bfile europe/output/file11a \
--indep-pairwise 5 1 0.1 \
--out europe/output/indepSNP_scaffolds \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNP_scaffolds.log

409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 85306 variants loaded from europe/output/file11a.bim. –indep-pairwise (28 compute threads): 37816/85306 variants removed.

Now we can compare the two sets of SNPs using scaffolds or chromosomal scale

Create Venn diagram of SNPs removed due to LD

# Read in the two files as vectors
prunout_chr <-
  read_delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/indepSNP_chr.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

prunout_scaffolds <-
  read_delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/indepSNP_scaffolds.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

# Convert the list to vector
prunout_scaffolds <- unlist(prunout_scaffolds)
prunout_chr <- unlist(prunout_chr)


# Calculate shared values
prunout <- intersect(prunout_chr, prunout_scaffolds)

# Create Venn diagram
venn_data1 <- list(
  "Chromosomal" = prunout_chr,
  "Scaffolds" = prunout_scaffolds
)

# create plot
venn_plot1 <- ggvenn(venn_data1, fill_color = c("steelblue", "darkorange"), show_percentage = TRUE)

# Add a title
venn_plot1 <- venn_plot1 +
  ggtitle("Comparison of genomic scales for linked SNPs") +
  theme(plot.title = element_text(hjust = .5))

# Display the Venn diagram
print(venn_plot1)

# Save Venn diagram to PDF
output_path <- here("output", "populations", "figures", "SNPs_linked_comparison.pdf")
ggsave(output_path, venn_plot1, height = 5, width = 5, dpi = 300)

7. Intergenic unlinked SNPs

We can also create a file to run the ancestry analyses using only intergenic SNPs

We can use the intergenic SNPs as “neutral” SNPs’

plink2 \
--allow-extra-chr \
--bfile europe/output/file7 \
--geno 0.2 \
--maf 0.1 \
--make-bed \
--extract europe/output/SnpEff/intergenic_SNPs.txt \
--out europe/output/intergenic \
--silent;
grep "samples\|variants" europe/output/intergenic.log 

8. Pruning r2 for LD-pruned datasets

We already have a dataset made using the chromosomal scale that pruned at 0.1 (made in step 4). If needed, can create it again using the code below

plink2 \
--allow-extra-chr \
--bfile europe/output/file10 \
--indep-pairwise 5 1 0.1 \
--out europe/output/indepSNP_chr \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNP_chr.log

The indepSNP_chr.prune.in file produced here = those SNPs < our 0.1 threshold, that we want to use

Now we need to make pruned dataset for 0.01.

plink2 \
--allow-extra-chr \
--bfile europe/output/file10 \
--indep-pairwise 5 1 0.01 \
--out europe/output/indepSNP_chr_r01 \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNP_chr_r01.log

409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 85306 variants loaded from europe/output/file10.bim. –indep-pairwise (3 compute threads): 68278/85306 variants removed.

indepSNP_chr.prune.in & indepSNP_chr_r01.prune.in have the SNPs we want to use in our LD-pruned datasets

Use the SNP set with the LD pruning we just did and extract them from file10 to get SNP Set 1

plink2 \
--keep-allele-order \
--bfile europe/output/file10 \
--make-bed \
--export vcf \
--out europe/output/snps_sets/r2_0.01 \
--extract europe/output/indepSNP_chr_r01.prune.in \
--silent
grep "variants\|samples" europe/output/snps_sets/r2_0.01.log

409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 85306 variants loaded from europe/output/file10.bim. –extract: 17028 variants remaining. 17028 variants remaining after main filters.

Repeat for 0.1 dataset (SNP Set 2)

plink2 \
--keep-allele-order \
--bfile europe/output/file10 \
--make-bed \
--export vcf \
--out europe/output/snps_sets/r2_0.1 \
--extract europe/output/indepSNP_chr.prune.in \
--silent
grep "variants\|samples" europe/output/snps_sets/r2_0.1.log

409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 85306 variants loaded from europe/output/file10.bim. –extract: 47484 variants remaining. 47484 variants remaining after main filters.

These snp set can now be used for subsequent pop gen analyses.

9. Make a new snp set by filtering Minor Allele Frequency < 1% (MAF 0.01) - SNP Set 3

9.2. Minor allele frequency

First, we estimate the allele frequency with Plink.

cd /gpfs/gibbs/pi/caccone/mkc54/albo
plink2 \
--allow-extra-chr \
--bfile europe/output/file3 \
--freq \
--out europe/output/MAF_check \
--silent

Now we apply the MAF filter for 0.01

# We will use MAF of 10%
plink2 \
--allow-extra-chr \
--bfile europe/output/file3 \
--maf 0.01 \
--make-bed \
--out europe/output/file4b \
--silent;
grep "variants" europe/output/file4b.log

107776 variants loaded from europe/output/file3.bim. 7329 variants removed due to allele frequency threshold(s) 100447 variants remaining after main filters.

Then we plot it with ggplot.

#   Import MAF data                                                         ####
maf_freq <-
  read.delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/MAF_check.afreq"
    ),
    header = TRUE
  )

Make MAF plot

#   make the plot                                                           ####
ggplot(
  maf_freq,
  aes(ALT_FREQS)
) +
  geom_histogram(
    colour = "black",
    fill = "lightgray",
    bins = 40
  ) +
  labs(
    x = "Minor Allele Frequency (MAF)",
    y = "Frequency (n)",
    caption = "<span style='color:red;'><i>Red</i></span> <span style='color:black;'><i>line at</i></span><span style='color:red;'><i> MAF 1%</i></span><span style='color:black;'><i> threshold</i></span>."
  ) +
  geom_text(
    aes(
      x = .01,
      y = 8000,
      label = paste0("7,329 SNPs")
    ),
    size = 3,
    color = "red",
    vjust = -.2
  ) +
  geom_vline(xintercept = 0.01, color = "red") +
  my_theme() +
  theme(plot.caption = element_markdown()) +
  scale_y_continuous(label = scales::number_format(big.mark = ",")) +
  scale_x_continuous(breaks = c(0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0))

# save the plot
ggsave(
  here("output", "europe", "figures", "MAF_0.01_freq_plot_europe.pdf"
  ),
  width  = 7,
  height = 5,
  units  = "in"
)

We removed 7,329 variants due to the MAF filter.

9.3. HWE test

Now we can run the HWE test. However, we will need to apply the SNP missingness again for each population. If we do not, the HWE will vary widely. With the bash script below, we will create a new file for each population, run the HWE test with HWE p value <1e‐6 (HWE p value <1e‐6). Then, we ask Plink to generate a list of SNPs that passed the test for each population.

mkdir -p europe/output/files/hardyb;
cp europe/output/file4b.* europe/output/files/hardyb/
for fam in $(awk '{print $1}' europe/output/files/hardyb/file4b.fam | sort | uniq); 
do 
echo $fam | \
plink2 \
--allow-extra-chr \
--silent \
--keep-allele-order \
--bfile europe/output/files/hardyb/file4b \
--keep-fam /dev/stdin \
--make-bed \
--out europe/output/files/hardyb/$fam \
--hwe 0.000001 \
--geno 0.1 \
--write-snplist; \
done

Next, we use “cat” and “awk” to concatenate the SNP list from all populations, and remove duplicates. Once we have a list of SNPs that passed the test for each population, we can use Plink to create a new bed file keeping only the SNPs that passed the test in each population. First, lets get the list of SNPs, and count how many passed:

cd /gpfs/gibbs/pi/caccone/mkc54/albo
cat europe/output/files/hardyb/*.snplist | awk '!a[$0]++' > europe/output/files/passed_hwe_b.txt;
wc -l europe/output/files/passed_hwe_b.txt
## 100447 europe/output/files/passed_hwe_b.txt

100447 europe/output/files/passed_hwe_b.txt All 100447 variants passed HWE test. If any failed, we could remove the variants that did not pass HWE test, using the –extract flag, extracting only those that passed HWE.

9.4. LD pruning

Since we do not have to remove any SNPs due to deviation from HWE, we can proceed with heterozygosity estimates. The first step is to “prune” our data set. We will check the pairwise linkage estimates for all SNPs. We can work with file4. We will use “indep-pairwise” to check if there are SNPs above a certain linkage disequilibrium (LD) threshold. Check Plink documentation for more details https://www.cog-genomics.org/plink/1.9/ld I used “--indep-pairwise 5 1 0.1” , which indicates according to the documentation: --indep-pairphase <window size>['kb'] <step size (variant ct)> <r^2 threshold> We will check in a window of 5kb if there is any pair of SNPs with r2 estimates above 0.1, then we will move our window 1 SNP and check again for SNPs above the threshold. We will repeat this procedure until we check the entire genome.

Need to switch back to plink2

module load PLINK/2_avx2_20221024
#PLINK/1.9b_6.21-x86_64 => PLINK/2_avx2_20221024
plink2 --version
plink2 \
--allow-extra-chr \
--bfile europe/output/file4b \
--extract europe/output/files/passed_hwe_b.txt \
--indep-pairwise 5 1 0.1 \
--out europe/output/indepSNPb \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNPb.log

–indep-pairwise 5 1 0.1 423 samples (44 females, 1 male, 378 ambiguous; 423 founders) loaded from 100447 variants loaded from europe/output/file4b.bim. –extract: 100447 variants remaining. 100447 variants remaining after main filters. –indep-pairwise (28 compute threads): 40576/100447 variants removed.

Remember, the SNPs are not removed from our data set. Plink created 3 files when we ran the code above. One is the “indepSNP.log” file, and the other two are: “indepSNP.prune.in” -> list of SNPs with squared correlation smaller than our r2 threshold of 0.1. “indepSNP.prune.out” -> list of SNPs with squared correlation greater than our r2 threshold of 0.1. For our heterozygosity estimates, we want to use the set of SNPs that are below our r2 threshold of 0.1. We consider that they are randomly associated. We can use Plink to estimate the heterozygosity using the “indepSNP.prune.in” file.

9.6. Heterozygosity

plink2 \
--allow-extra-chr \
--bfile europe/output/file4b \
--extract europe/output/indepSNPb.prune.in \
--het \
--out europe/output/R_checkb \
--silent;
grep 'variants' europe/output/R_checkb.log

100447 variants loaded from europe/output/file4b.bim. –extract: 59871 variants remaining. 59871 variants remaining after main filters.

#   find individuals with high heterozygosity                              ####
# import the data from Plink
het <- read.delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/R_checkb.het"
  ),
  head = TRUE
)
#
# check head of the file
colnames(het)
## [1] "X.FID"  "IID"    "O.HOM." "E.HOM." "OBS_CT" "F"

Estimate heterozygosity

# create a column named HET_RATE and calculate the heterozygosity rate
het$HET_RATE <- (het$"OBS_CT" - het$"O.HOM") / het$"OBS_CT"
#
# use subset function to get values deviating from 4sd of the mean heterozygosity rate.
het_fail <-
  subset(
    het, (het$HET_RATE < mean(
      het$HET_RATE
    ) - 4 * sd(
      het$HET_RATE
    )) |
      (het$HET_RATE > mean(
        het$HET_RATE
      ) + 4 * sd(
        het$HET_RATE
      ))
  )
#
# get the list of individuals that failed our threshold of 4sd from the mean.
het_fail$HET_DST <-
  (het_fail$HET_RATE - mean(
    het$HET_RATE
  )) / sd(
    het$HET_RATE
  )

Save the files to use with Plink

#   save the data to use with Plink2                                        ####
#
write.table(
  het_fail,
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fail-het-qc_b.txt"
  ),
  row.names = FALSE
)

Make plot

#   make a heterozygosity plot                                              ####
#
ggplot(
  het,
  aes(
    HET_RATE
  )
) +
  geom_histogram(
    colour           = "black",
    fill             = "lightgray",
    bins             = 40
  ) +
  labs(
    x                = "Heterozygosity Rate",
    y                = "Number of Individuals"
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      )
    ),
    col              = "red",
    linewidth        = 1.5
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      ) + 4 * sd(
        HET_RATE
      )
    ),
    col              = "#BFB9B9",
    linewidth        = 1
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      ) - 4 * sd(
        HET_RATE
      )
    ),
    col              = "#BFB9B9",
    linewidth        = 1
  ) + 
  my_theme() +
  scale_y_continuous(
  )

#   save the heterozygosity plot                                            ####
ggsave(
  here("output", "europe", "figures", "Heterozygosity_europe_MAF_0.01.pdf"
  ),
  width  = 5,
  height = 4,
  units  = "in"
)

The red line in the plot above indicates the mean, and the gray lines indicate 4 standard deviation from the mean. We can see that some mosquitoes do have excess heterozygous sites. We will remove them. We can get their ID from the file “fail-het-qc.txt”. We can use the bash script below to parse the file to use with Plink

cd /gpfs/gibbs/pi/caccone/mkc54/albo
sed 's/"// g' europe/output/fail-het-qc_b.txt | awk '{print$1, $2}'> europe/output/het_fail_ind_b.txt;
echo 'How many mosquitoes we need to remove from our data set:'; #5
cat europe/output/het_fail_ind_b.txt | tail -n +2 | wc -l;
echo 'Which mosquitoes we have to remove:';
tail -n +2 europe/output/het_fail_ind_b.txt
## How many mosquitoes we need to remove from our data set:
## 5
## Which mosquitoes we have to remove:
## GRA 734
## TUA 783
## ITP 829
## ITP 832
## ROS 858

Heterozygosity is outside the SD threshold in 1 GRA (734), 1 TUA (783), 2 ITP (829, 832), and 1 ROS (858). We will remove these 5 individuals. Next, we will remove these mosquitoes from our data set using Plink:

plink2 \
--allow-extra-chr \
--bfile europe/output/file4b \
--remove europe/output/het_fail_ind_b.txt \
--make-bed \
--out europe/output/file5b \
--silent;
grep 'variants\|samples' europe/output/file5b.log

423 samples (44 females, 1 male, 378 ambiguous; 423 founders) loaded from 100447 variants loaded from europe/output/file4b.bim. –remove: 418 samples remaining. 418 samples (44 females, 1 male, 373 ambiguous; 418 founders) remaining after

9.7. Relatedness

Check for cryptic relatedness. Check Plink2 documentation https://www.cog-genomics.org/plink/2.0/distance You can download King directly https://www.kingrelatedness.com/manual.shtml or check their manuscript https://www.ncbi.nlm.populations.gov/pmc/articles/PMC3025716/pdf/btq559.pdf From Plink2 documentation: “Note that KING kinship coefficients are scaled such that duplicate samples have kinship 0.5, not 1. First-degree relations (parent-child, full siblings) correspond to ~0.25, second-degree relations correspond to ~0.125, etc. It is conventional to use a cutoff of ~0.354 (the geometric mean of 0.5 and 0.25) to screen for monozygotic twins and duplicate samples, ~0.177 to add first-degree relations, etc.” There two options. One is to run only –make-king and another one is to use –make-king-table We will use the threshold of 0.354 and create a table.

# Plink2 will create a file with extension .king
plink2 \
--allow-extra-chr \
--bfile europe/output/file5b \
--extract europe/output/indepSNPb.prune.in \
--make-king-table rel-check \
--king-table-filter 0.354 \
--out europe/output/file5b \
--silent;
grep 'variants\|samples' europe/output/file5b.log

418 samples (44 females, 1 male, 373 ambiguous; 418 founders) loaded from 100447 variants loaded from europe/output/file5b.bim. –extract: 59871 variants remaining. 59871 variants remaining after main filters.

Check the individuals that did not pass our filtering.

cd /gpfs/gibbs/pi/caccone/mkc54/albo
head -n 14 europe/output/file5b.kin0
## #FID1    IID1    FID2    IID2    NSNP    HETHET  IBS0    KINSHIP
## DES  1445    DES 1292    57878   0.233681    0.00438854  0.442954
## ITB  748 ITB 747 58207   0.233889    0.0104455   0.39944
## SIC  1231    SIC 1229    56168   0.238534    0.00322248  0.45095
## SIC  1235    SIC 1233    56705   0.223772    0.0068777   0.402903
## SIC  1235    SIC 1234    56068   0.208497    0.00986302  0.362625
## SIC  1236    SIC 1234    56308   0.214002    0.00898629  0.377089
## SPS  915 SPS 914 58015   0.19719 0.0103939   0.362395
## TUA  779 TUA 776 57575   0.21485 0.0127312   0.370149
## TUA  780 TUA 777 57837   0.220464    0.00933658  0.387552

We want to remove one of the individuals of the pairs.

plink2 \
--allow-extra-chr \
--bfile europe/output/file5b \
--extract europe/output/indepSNPb.prune.in \
--make-king triangle bin \
--out europe/output/file6b \
--silent;
grep 'variants\|samples' europe/output/file6b.log

418 samples (44 females, 1 male, 373 ambiguous; 418 founders) loaded from 100447 variants loaded from europe/output/file5b.bim. –extract: 59871 variants remaining. 59871 variants remaining after main filters. –make-king pass 1/1: Scanning for rare variants… done. 374 variants handled by initial scan (59497 remaining). –make-king: 59871 variants processed.

Now we can use Plink2 to remove one of the mosquitoes from the pair with high kinship. It will remove 10 samples since we had 2 samples in some populations with high relatedness, and we could remove 1 and keep the other two. Plink2 always tries to maximize the number of samples passing the filters.

plink2 \
--allow-extra-chr \
--bfile europe/output/file5b \
--king-cutoff europe/output/file6b 0.354 \
--make-bed \
--out europe/output/file7b \
--silent;
grep 'samples\|variants\|remaining' europe/output/file7b.log

418 samples (44 females, 1 male, 373 ambiguous; 418 founders) loaded from 100447 variants loaded from europe/output/file5b.bim. europe/output/file7b.king.cutoff.out.id , and 410 remaining sample IDs written

9.8. Create chromosomal scale for MAF 0.01 files

Remove packages loaded before creating the chromosomal scale

invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload=TRUE))
## Warning: 'scales' namespace cannot be unloaded:
##   namespace 'scales' is imported by 'ggplot2' so cannot be unloaded
## Warning: 'dplyr' namespace cannot be unloaded:
##   namespace 'dplyr' is imported by 'tidyr' so cannot be unloaded
## Warning: 'purrr' namespace cannot be unloaded:
##   namespace 'purrr' is imported by 'tidyr' so cannot be unloaded
## Warning: 'tibble' namespace cannot be unloaded:
##   namespace 'tibble' is imported by 'ggplot2', 'dplyr' so cannot be unloaded
library(tidyverse)
library(here)

Import the .bim file with the SNPs to create a new chromosomal scale.

#   ____________________________________________________________________________
#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7b.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-583033342     0   315059 C       G      
## 2 1        AX-583035163     0   315386 A       G      
## 3 1        AX-583033356     0   315674 C       T      
## 4 1        AX-583033370     0   330057 G       A      
## 5 1        AX-583035194     0   330265 A       G      
## 6 1        AX-583033387     0   331288 C       T

Separate the tibbles into each chromosome

#   separate the SNP data per chromosome                                    
# chr1
chr1_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^1."
    )
  ) |> # here we get only Scaffold rows starting with 1
  as_tibble() # save as tibble
#
# chr2
chr2_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^2."
    )
  ) |>
  as_tibble()
#
# chr3
chr3_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^3."
    )
  ) |>
  as_tibble()

Import the file with sizes of each scaffold.

#   import the file with the scaffold sizes                                 ####
sizes <-
  read_delim(
    here(
      "/gpfs/gibbs/project/caccone/mkc54/albo/genome/scaffold_sizes.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "cd"
  )
#
# set column names
colnames(
  sizes
) <- c(
  "Scaffold", "Size"
)
#   ____________________________________________________________________________
#   create new column with the chromosome number                            ####
sizes <- 
  sizes |>
  mutate(
    Chromosome = case_when( # we use mutate to create a new column called Chromosome
      startsWith(
        Scaffold, "1"
      ) ~ "1", # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
      startsWith(
        Scaffold, "2"
      ) ~ "2",
      startsWith(
        Scaffold, "3"
      ) ~ "3"
    )
  ) |>
  arrange(
    Scaffold
  )                   # to sort the order of the scaffolds, fixing the problem we have with scaffold 1.86
# check it
head(sizes)
## # A tibble: 6 × 3
##   Scaffold     Size Chromosome
##   <chr>       <dbl> <chr>     
## 1 1.1        351198 1         
## 2 1.10     11939576 1         
## 3 1.100     3389100 1         
## 4 1.101      470438 1         
## 5 1.102     2525157 1         
## 6 1.103      150026 1

Create new scale. Get the scaffolds for each chromosome.

#   separate the scaffold sizes tibble per chromosome                       ####
# chr1
chr1_scaffolds <- 
  sizes |>
  filter(
    str_detect(
      Scaffold, "^1" # we use library stringr to get scaffolds starting with 1 (chromosome 1)
    )
  ) |> 
  as_tibble()
#
# chr2
chr2_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^2" # we use library stringr to get scaffolds starting with 2 (chromosome 2)
    )
  ) |> 
  as_tibble()
#
# # chr3
chr3_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^3" # we use library stringr to get scaffolds starting with 3 (chromosome 3)
    )
  ) |>
  as_tibble()

Create a scale for each chromosome.

#   create a new scale for each chromosome                                  ####
# chr1
chr1_scaffolds$overall_size_before_bp <-
  0                                                                        # we create a new column with zeros
for (i in 2:nrow(
  chr1_scaffolds
)
) {                                                                        # loop to start on second line
  chr1_scaffolds$overall_size_before_bp[i] <-                              # set position on the scale
    chr1_scaffolds$overall_size_before_bp[i - 1] + chr1_scaffolds$Size[i - # add the scaffold size and the location to get position on new scale
      1]
}
#
# chr2
chr2_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr2_scaffolds
)
) {
  chr2_scaffolds$overall_size_before_bp[i] <-
    chr2_scaffolds$overall_size_before_bp[i - 1] + chr2_scaffolds$Size[i -
      1]
}
#
# chr3
chr3_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr3_scaffolds
)
) {
  chr3_scaffolds$overall_size_before_bp[i] <-
    chr3_scaffolds$overall_size_before_bp[i - 1] + chr3_scaffolds$Size[i -
      1]
}

Merge the data frames scaffolds and SNPs.

#   merge the data sets using the tidyverse function left_join              ####
# chr1
chr1_scale <-
  chr1_snps |>          # create data frame for each chromosome, get chr1_snps
  left_join(            # use lef_join function to merge it with chr1_scaffolds
    chr1_scaffolds,
    by = "Scaffold"
  ) |>                  # set column to use for merging (Scaffold in this case)
  na.omit() |>          # remove NAs, we don't have SNPs in every scaffold
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +                 # make new columns numeric
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr2
chr2_scale <-
  chr2_snps |>
  left_join(
    chr2_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr3
chr3_scale <-
  chr3_snps |>
  left_join(
    chr3_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )

Merge all chromosome scales.

#   merge the data sets, and select only the columns we need                ####
chroms <- rbind(
  chr1_scale, chr2_scale, chr3_scale
) |>
  dplyr::select(
    Chromosome, SNP, Cm, midPos_fullseq, Allele1, Allele2
  )
# check it
head(chroms)
## # A tibble: 0 × 6
## # ℹ 6 variables: Chromosome <chr>, SNP <chr>, Cm <int>, midPos_fullseq <dbl>,
## #   Allele1 <chr>, Allele2 <chr>

Save the new .bim file

#   ____________________________________________________________________________
#   save the new bim file with a new name, I added "B"                      ####
write.table(
  chroms,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7C.bim"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Rename the .bim files

# change the name of the first .bim file, for example, append _backup.bim, and then replace the original file
mv europe/output/file7b.bim europe/output/file7b_backup.bim;
# than change the new bim we create to the original name (do it only once, otherwise it will mess up)
mv europe/output/file7C.bim europe/output/file7b.bim

Create a new bed file with Plink2 to see if it works. For example, to see if the variants are in the right order. Plink2 will give us a warning.

cd /gpfs/gibbs/pi/caccone/mkc54/albo

plink2 \
--bfile europe/output/file7b \
--make-bed \
--out europe/output/test01;
# then we remove the files 
rm europe/output/test01.*

PLINK v2.00a3.7LM AVX2 Intel (24 Oct 2022) www.cog-genomics.org/plink/2.0/ (C) 2005-2022 Shaun Purcell, Christopher Chang GNU General Public License v3 Logging to europe/output/test01.log. Options in effect: –bfile europe/output/file7b –make-bed –out europe/output/test01 Start time: Wed Jan 31 10:57:07 2024 257256 MiB RAM detected; reserving 128628 MiB for main workspace. Using up to 32 threads (change this with –threads). 410 samples (41 females, 1 male, 368 ambiguous; 410 founders) loaded from europe/output/file7b.fam. 100447 variants loaded from europe/output/file7b.bim. Note: No phenotype data present. Writing europe/output/test01.fam … done. Writing europe/output/test01.bim … done. Writing europe/output/test01.bed … done. End time: Wed Jan 31 10:57:07 2024

9.8.1. Estimate LD blocks using chromosomal scale

mkdir -p europe/output/files/ld/blocks_chr_b
module load PLINK/1.9b_6.21-x86_64

We can use Plink1.9 to estimate LD blocks for the populations with more than 12 individuals. We will use the entire genome for this part instead of the larger scaffolds only. We will set max distance of LD blocks of 500kb. We found out that the average half distance of r^2 max is small, from 1 to 5kb

for fam in $(awk '{print $1}' europe/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile europe/output/file7b \
--keep-fam /dev/stdin \
--maf 0.01 \
--blocks no-pheno-req \
--blocks-max-kb 200 \
--out europe/output/files/ld/blocks_chr_b/$fam \
--geno 0.1 \
--silent
done;
#
rm europe/output/files/ld/blocks_chr_b/*.nosex

Now we can get some data from our .log files

cd /gpfs/gibbs/pi/caccone/mkc54/albo

echo "Population,n,nVariants,geno,maf,passQC" > europe/output/files/ld/blocks_chr_b/table_ld_statsb.csv
for file in europe/output/files/ld/blocks_chr_b/*.log
do
  variants=$(grep -oE '([0-9]+) variants loaded from \.bim file' $file | grep -oE '[0-9]+')
  geno=$(grep -oE '([0-9]+) variants removed due to missing genotype data \(--geno\)' $file | grep -oE '[0-9]+')
  maf=$(grep -oE '([0-9]+) variants removed due to minor allele threshold\(s\)' $file | grep -oE '[0-9]+')
  pass=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | head -1)
  n=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | tail -1)
  filename=$(basename $file .log)
  echo "$filename,$n,$variants,$geno,$maf,$pass" >> europe/output/files/ld/blocks_chr_b/table_ld_statsb.csv
done;
head -n 5 europe/output/files/ld/blocks_chr_b/table_ld_statsb.csv
## Population,n,nVariants,geno,maf,passQC
## ALU,12,100447,5869,14006,80572
## ALV,12,100447,5048,12999,82400
## BAR,12,100447,6824,16396,77227
## BRE,13,100447,11630,18024,70793

Reload libraries

library(dplyr)
library(ggplot2)
library(colorout)
library(extrafont)
library(reticulate)
library(scales)
library(stringr)
library(grid)
library(flextable)
library(devtools)
library(readr)
library(purrr)
library(ggtext)
library(ggvenn)
#library(data.table)
library(RColorBrewer)
library(ggrepel)

We can check it out

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr_b/table_ld_statsb.csv"
    ),
  header = TRUE,
  sep = ","
)

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 1: Summary of quality control for Europe dataset_b.")

ft

Table 1: Summary of quality control for Europe dataset_b.

Population

n

nVariants

geno

maf

passQC

ALU

12

100,447

5,869

14,006

80,572

ALV

12

100,447

5,048

12,999

82,400

BAR

12

100,447

6,824

16,396

77,227

BRE

13

100,447

11,630

18,024

70,793

CES

14

100,447

17,813

19,512

63,122

CRO

12

100,447

5,124

13,087

82,236

DES

16

100,447

20,303

11,811

68,333

FRS

12

100,447

4,219

9,690

86,538

GES

12

100,447

4,638

20,671

75,138

GRA

11

100,447

5,764

9,984

84,699

ITR

12

100,447

4,101

10,609

85,737

KER

12

100,447

4,466

15,949

80,032

KRA

12

100,447

7,343

13,234

79,870

MAL

12

100,447

4,371

8,831

87,245

POP

12

100,447

4,453

8,070

87,924

RAR

12

100,447

4,579

20,924

74,944

ROS

11

100,447

5,704

15,530

79,213

SEV

12

100,447

4,991

24,210

71,246

SIC

9

100,447

21,659

12,938

65,850

SLO

12

100,447

7,624

7,563

85,260

SOC

12

100,447

4,728

18,025

77,694

TIK

12

100,447

4,646

21,881

73,920

TRE

12

100,447

4,152

7,541

88,754

TUA

9

100,447

14,341

15,721

70,385

TUH

12

100,447

5,326

11,173

83,948

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/summary_blocks_chr_b.docx"))

Now we can count how many blocks we found in each population

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr_b
wc -l *.blocks | \
  awk '{population = gensub(/\.blocks_chr/, "", "g", $2); print population "\t" $1}' | \
  sed 's#europe/output/files/ld/blocks_chr_b/##' | \
  sed 's/.blocks//' | \
  sed '$d' > populations_block_counts_b.csv;
head -n 79 populations_block_counts_b.csv

Now we can add the number of blocks to the table we made

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr_b/table_ld_statsb.csv"
    ),
  header = TRUE,
  sep = ","
)

# Load the population counts data from the CSV file
pop_counts <-
  read.delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr_b/populations_block_counts_b.csv"
    ),
    header = F,
    sep = "\t"
  ) |>
  rename(
    Population       = 1,
    nBlocks          = 2
  )

# Merge the population counts with the table data
ld_blocks <- merge(ld_blocks, pop_counts, by = "Population")

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 2: Number of linkage blocks detected with Plink for populations with at least 10 individuals.")

ft

Table 2: Number of linkage blocks detected with Plink for populations with at least 10 individuals.

Population

n

nVariants

geno

maf

passQC

nBlocks

ALU

12

100,447

5,869

14,006

80,572

79

ALV

12

100,447

5,048

12,999

82,400

82

BAR

12

100,447

6,824

16,396

77,227

107

BRE

13

100,447

11,630

18,024

70,793

289

CES

14

100,447

17,813

19,512

63,122

368

CRO

12

100,447

5,124

13,087

82,236

93

DES

16

100,447

20,303

11,811

68,333

378

FRS

12

100,447

4,219

9,690

86,538

60

GES

12

100,447

4,638

20,671

75,138

166

GRA

11

100,447

5,764

9,984

84,699

29

ITR

12

100,447

4,101

10,609

85,737

83

KER

12

100,447

4,466

15,949

80,032

130

KRA

12

100,447

7,343

13,234

79,870

79

MAL

12

100,447

4,371

8,831

87,245

75

POP

12

100,447

4,453

8,070

87,924

66

RAR

12

100,447

4,579

20,924

74,944

158

ROS

11

100,447

5,704

15,530

79,213

69

SEV

12

100,447

4,991

24,210

71,246

212

SIC

9

100,447

21,659

12,938

65,850

0

SLO

12

100,447

7,624

7,563

85,260

39

SOC

12

100,447

4,728

18,025

77,694

123

TIK

12

100,447

4,646

21,881

73,920

193

TRE

12

100,447

4,152

7,541

88,754

73

TUA

9

100,447

14,341

15,721

70,385

3

TUH

12

100,447

5,326

11,173

83,948

125

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/summary_blocks_chr_b.docx"))

Get the size of each block from the .block.det files

get_kb_column <- function(dir_path) {
  # obtain the list of files with extension .blocks.det
  file_names <- list.files(path = dir_path, pattern = "\\.blocks\\.det$", full.names = TRUE)
  
  # create an empty list to hold the data frames
  block_list <- list()
  
  # loop through the files and read the data into the list
  for (file in file_names) {
    df <- read.table(file, header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
    # select only the KB column and add it to the block_list with the file name
    block_list[[file]] <- df %>% dplyr::select(KB) %>% add_column(file = file, .before = 1)
  }
  
  # combine the data frames in the block_list into a single data frame
  blocks <- bind_rows(block_list)
  
  # clean up the file name column
  blocks$file <- str_remove(blocks$file, "^.*\\/ld\\/blocks\\/")
  
  return(blocks)
}

# example usage: replace dir_path with your directory path
dir_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr_b")

blocks<- 
  get_kb_column(dir_path) |>
  mutate(file = str_remove(file, "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/files/ld/blocks_chr_b/")) |>
  mutate(file = str_remove(file, ".blocks.det")) |>

  as_tibble() |>
  rename(
    Population = 1
  )

Create density plot of the size of the LD blocks Plink found

source(
  here("my_theme2.R"
  )
)

n <- 50
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
pie(rep(1,n), col=sample(col_vector, n))

col_vector
##  [1] "#7FC97F" "#BEAED4" "#FDC086" "#FFFF99" "#386CB0" "#F0027F" "#BF5B17"
##  [8] "#666666" "#1B9E77" "#D95F02" "#7570B3" "#E7298A" "#66A61E" "#E6AB02"
## [15] "#A6761D" "#666666" "#A6CEE3" "#1F78B4" "#B2DF8A" "#33A02C" "#FB9A99"
## [22] "#E31A1C" "#FDBF6F" "#FF7F00" "#CAB2D6" "#6A3D9A" "#FFFF99" "#B15928"
## [29] "#FBB4AE" "#B3CDE3" "#CCEBC5" "#DECBE4" "#FED9A6" "#FFFFCC" "#E5D8BD"
## [36] "#FDDAEC" "#F2F2F2" "#B3E2CD" "#FDCDAC" "#CBD5E8" "#F4CAE4" "#E6F5C9"
## [43] "#FFF2AE" "#F1E2CC" "#CCCCCC" "#E41A1C" "#377EB8" "#4DAF4A" "#984EA3"
## [50] "#FF7F00" "#FFFF33" "#A65628" "#F781BF" "#999999" "#66C2A5" "#FC8D62"
## [57] "#8DA0CB" "#E78AC3" "#A6D854" "#FFD92F" "#E5C494" "#B3B3B3" "#8DD3C7"
## [64] "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69" "#FCCDE5"
## [71] "#D9D9D9" "#BC80BD" "#CCEBC5" "#FFED6F"
# make plot using the sample y scale for all populations
ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = col_vector) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3) + my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here("output",
    "europe",
    "figures",
    "block_density_y_scale_fixed_chr_europe_b.pdf"
  ),
  width = 10,
  height = 10,
  units = "in"
)

Make plot allowing the y axis scale free

ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = col_vector) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3, scales = "free_y") +
  my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here("output",
    "europe",
    "figures",
    "block_density_free_scale_chr_europe_b.pdf"
  ),
  width = 20,
  height = 18,
  units = "in"
)

9.9. Plot SNP density

After quality control with approximately 100k SNPs

# load the function that we saved earlier
source(
  here("analyses", "import_bim.R"
  ),
  local = knitr::knit_global()
)
# import the file
snp_den_qc <- import_bim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7b.bim"
  )
)

Make plot of the SNP density

#   ____________________________________________________________________________
#   plot SNP density after QC                                               ####
snp_den_qc |>
  rename(
    Chromosome = 1
  ) |>
  mutate(
    Position           = as.numeric(
      Position
    )
  ) |>
  ggplot(
    aes(
      x                = Position
    ),
    label              = sprintf(
      "%0.2f",
      round(
        a,
        digits         = 0
      )
    )
  ) +
  geom_histogram(
    aes(
      y                = after_stat(
        count
      )
    ),
    binwidth           = 1e6
  ) +
  facet_wrap(
    vars(
      Chromosome
    ),
    scales             = "free_x"
  ) +
  labs(
    title              = "SNP Density after QC",
    x                  = expression(
      "Position in the genome (Mb)"
    ),
    y                  = expression(
      "Number of SNPs"
    )
  ) +
  scale_x_continuous(
    labels             = function(x) {
      format(
        x / 1e6,
        big.mark       = ",", 
        scientific     = FALSE
      )
    }
  ) +
  geom_density(
  aes(
    y = 1e6 * after_stat(count)
  ),
  color = "red",
  linewidth = .75,
  alpha = .4,
  fill = "pink"
  ) +
    theme(
    panel.grid.major   = element_line(
      linetype         = "dashed",
      linewidth        = 0.2
    ),
    panel.grid.minor   = element_line(
      linetype         = "dashed",
      linewidth        = 0.2
    ),
    panel.spacing      = unit(0.5, "lines"),
    strip.text         = element_text(
      face             = "bold", hjust = .5
    ),
    strip.background.x = element_rect(
      color            = "gray"
    )
  )

#   save the density plot
ggsave(
  here("output", "europe","figures", "snp_density_after_qc_europeb.pdf"
  ),
  width  = 10,
  height = 6,
  units  = "in"
)

SNPs per chromosome

# we can use dplyr "count" to get the number of SNPs for each chromosome
# lets get the data we need
snps_per_chrm <- 
  snp_den_qc |>
  count(
    Scaffold) |>
  rename(
    Chromosome = 1,
    "SNPs (N) " = 2
  )

# Create the flextable
ft <- flextable::flextable(snps_per_chrm)

# Apply zebra theme
ft <- flextable::theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "SNPs per chromosome after quality control")
ft

SNPs per chromosome after quality control

Chromosome

SNPs (N)

1

22,358

2

42,076

3

36,013

We can get the mean number of SNPs per chromosome for the entire genome

# we first use dplyr cut_width to get the number of SNPs per 1Mb window
albo_den <- 
  snp_den_qc |>
  dplyr::select(
    Scaffold, Position
  ) |>
  group_by(
    Scaffold,
    windows               = cut_width(
      Position,
      width               = 1e6,
      boundary            = 0
    )
  ) |>
  summarise(
    n                     = n(),
    .groups               = "keep"
  ) |>
  group_by(
    Scaffold
  ) |>
  summarise(
    mean                  = mean(n),
    n                     = n(),
    .groups               = "keep"
  ) |>
  rename(
    Chromosome            = 1,
    "SNPs per 1Mb window" = 2,
    "Number of windows"   = 3
  )
#
# check the results
snp_table <-
  flextable(
    albo_den
  )
snp_table <- colformat_double(
  x        = snp_table,
  big.mark = ",",
  digits   = 2,
  na_str   = "N/A"
)
snp_table

Chromosome

SNPs per 1Mb window

Number of windows

1

60.43

370

2

72.42

581

3

73.80

488

Merge objects

# we can merge the two data sets we created above into one table
after_qc <-
  snps_per_chrm |>
  left_join(
    albo_den,
    by = "Chromosome"
  )
snp_table2 <- flextable(
  after_qc)
snp_table2 <- colformat_double(
  x        = snp_table2,
  big.mark = ",",
  digits   = 2, 
  na_str   = "N/A"
  )
snp_table2

Chromosome

SNPs (N)

SNPs per 1Mb window

Number of windows

1

22,358

60.43

370

2

42,076

72.42

581

3

36,013

73.80

488

9.10. LD pruning using the chromosomal scale for MAF 0.01

# we set a window of variants of 5 and move the window 1 variant per time, removing 1 of the variants with lowest MAF from a pair above the threshold of r^2 > 0.1
# the mean distance is 203kb across the tested populations. Try --indep-pairwise 200kb 1 0.1
plink2 \
--allow-extra-chr \
--bfile europe/output/file7b \
--indep-pairwise 5 1 0.1 \
--out europe/output/indepSNP_chrb \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNP_chrb.log

–indep-pairwise 5 1 0.1 410 samples (41 females, 1 male, 368 ambiguous; 410 founders) loaded from 100447 variants loaded from europe/output/file7b.bim. –indep-pairwise (3 compute threads): 40644/100447 variants removed.

Lets do the scaffold again

First need to import same SNP list Import the .bim file with the SNPs

#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7b_backup.bim" #output/populations/file4.bim
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.1      AX-583033342     0   315059 C       G      
## 2 1.1      AX-583035163     0   315386 A       G      
## 3 1.1      AX-583033356     0   315674 C       T      
## 4 1.1      AX-583033370     0   330057 G       A      
## 5 1.1      AX-583035194     0   330265 A       G      
## 6 1.1      AX-583033387     0   331288 C       T
write.table(
  snps,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_all_scaffolds_4ld_b.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)
nrow(snps)
## [1] 100447
plink2 \
--allow-extra-chr \
--bfile europe/output/file5b \
--king-cutoff europe/output/file6b 0.354 \
--make-bed \
--out europe/output/file11b \
--silent;
grep 'samples\|variants\|remaining' europe/output/file11b.log

418 samples (44 females, 1 male, 373 ambiguous; 418 founders) loaded from 100447 variants loaded from europe/output/file5b.bim. europe/output/file11b.king.cutoff.out.id , and 410 remaining sample IDs written

# we set a window of variants of 5 and move the window 1 variant each time, removing 1 of the variants with lowest MAF from a pair above the threshold of r^2 > 0.1
# the mean distance is 203kb across the tested populations. We used --indep-pairwise 5 1 0.1 before. We can use the same values from the mean half distance max r2
plink2 \
--allow-extra-chr \
--bfile europe/output/file11b \
--indep-pairwise 5 1 0.1 \
--out europe/output/indepSNP_scaffoldsb \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNP_scaffoldsb.log

–indep-pairwise 5 1 0.1 410 samples (41 females, 1 male, 368 ambiguous; 410 founders) loaded from 100447 variants loaded from europe/output/file11b.bim. –indep-pairwise (28 compute threads): 40637/100447 variants removed.

Now we can compare the two sets of SNPs using scaffolds or chromosomal scale

Create Venn diagram of SNPs removed due to LD

# Read in the two files as vectors
prunout_chr <-
  read_delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/indepSNP_chrb.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

prunout_scaffolds <-
  read_delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/indepSNP_scaffoldsb.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

# Convert the list to vector
prunout_scaffolds <- unlist(prunout_scaffolds)
prunout_chr <- unlist(prunout_chr)


# Calculate shared values
prunout <- intersect(prunout_chr, prunout_scaffolds)

# Create Venn diagram
venn_data1 <- list(
  "Chromosomal" = prunout_chr,
  "Scaffolds" = prunout_scaffolds
)

# create plot
venn_plot1 <- ggvenn(venn_data1, fill_color = c("steelblue", "darkorange"), show_percentage = TRUE)

# Add a title
venn_plot1 <- venn_plot1 +
  ggtitle("Comparison of genomic scales for linked SNPs MAF 0.01") +
  theme(plot.title = element_text(hjust = .5))

# Display the Venn diagram
print(venn_plot1)

# Save Venn diagram to PDF
output_path <- here("output", "europe", "figures", "SNPs_linked_comparison_europe_b.pdf")
ggsave(output_path, venn_plot1, height = 5, width = 5, dpi = 300)

Now compare MAF 0.01 to MAF 0.1 pruning Create Venn diagram of SNPs removed due to LD

# Read in the two files as vectors
prunout_chr_0.01 <-
  read_delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/indepSNP_chrb.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

prunout_chr_0.1 <-
  read_delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/indepSNP_chr.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

# Convert the list to vector
prunout_chr_0.01 <- unlist(prunout_chr_0.01)
prunout_chr_0.1 <- unlist(prunout_chr_0.1)


# Calculate shared values
prunout <- intersect(prunout_chr_0.01, prunout_chr_0.1)

# Create Venn diagram
venn_data1 <- list(
  "MAF 1%" = prunout_chr_0.01,
  "MAF 10%" = prunout_chr_0.1
)

# create plot
venn_plot1 <- ggvenn(venn_data1, fill_color = c("steelblue", "darkorange"), show_percentage = TRUE)

# Add a title
venn_plot1 <- venn_plot1 +
  ggtitle("Comparison of Europe SNPs sets pruned for MAF 1% and MAF 10%") +
  theme(plot.title = element_text(hjust = .5))

# Display the Venn diagram
print(venn_plot1)

# Save Venn diagram to PDF
output_path <- here("output", "europe", "figures", "SNPs_comparison_europe_MAFs.pdf")
ggsave(output_path, venn_plot1, height = 5, width = 5, dpi = 300)

9.11. Pruning r2 for LD-pruned datasets with MAF 1%

If needed, can create it again using the code below

plink2 \
--allow-extra-chr \
--bfile europe/output/file7b \
--indep-pairwise 5 1 0.1 \
--out europe/output/indepSNP_chrb \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNP_chrb.log

–indep-pairwise 5 1 0.1 410 samples (41 females, 1 male, 368 ambiguous; 410 founders) loaded from 100447 variants loaded from europe/output/file7b.bim. –indep-pairwise (3 compute threads): 40644/100447 variants removed.

The indepSNP_chrb.prune.in file produced here = those SNPs < our LD 0.1 threshold, that we want to use

Now we need to make pruned dataset for 0.01. Can use file7b

plink2 \
--allow-extra-chr \
--bfile europe/output/file7b \
--indep-pairwise 5 1 0.01 \
--out europe/output/indepSNP_chrb_r01 \
--silent;
grep 'pairwise\|variants\|samples' europe/output/indepSNP_chrb_r01.log

–indep-pairwise 5 1 0.01 410 samples (41 females, 1 male, 368 ambiguous; 410 founders) loaded from 100447 variants loaded from europe/output/file7b.bim. –indep-pairwise (3 compute threads): 79479/100447 variants removed.

We can use the SNP set the LD pruning we just did and extract them from file7 to get (Set 1)

plink2 \
--keep-allele-order \
--bfile europe/output/file7b \
--make-bed \
--export vcf \
--out europe/output/snps_sets/r2_0.01_b \
--extract europe/output/indepSNP_chrb_r01.prune.in \
--silent
grep "variants\|samples" europe/output/snps_sets/r2_0.01_b.log

410 samples (41 females, 1 male, 368 ambiguous; 410 founders) loaded from 100447 variants loaded from europe/output/file7b.bim. –extract: 20968 variants remaining. 20968 variants remaining after main filters.

Repeat for 0.1 dataset (Set 2)

plink2 \
--keep-allele-order \
--bfile europe/output/file7b \
--make-bed \
--export vcf \
--out europe/output/snps_sets/r2_0.1_b \
--extract europe/output/indepSNP_chrb.prune.in \
--silent
grep "variants\|samples" europe/output/snps_sets/r2_0.1_b.log

410 samples (41 females, 1 male, 368 ambiguous; 410 founders) loaded from 100447 variants loaded from europe/output/file7b.bim. –extract: 59803 variants remaining. 59803 variants remaining after main filters.

Create Venn diagram of SNPs removed due to LD

# Read in the two files as vectors
prunout_chr_0.01 <-
  read_delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/indepSNP_chrb_r01.prune.in"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

prunout_chr_0.1 <-
  read_delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/indepSNP_chr_r01.prune.in"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

# Convert the list to vector
prunout_chr_0.01 <- unlist(prunout_chr_0.01)
prunout_chr_0.1 <- unlist(prunout_chr_0.1)


# Calculate shared values
prunout <- intersect(prunout_chr_0.01, prunout_chr_0.1)

# Create Venn diagram
venn_data1 <- list(
  "MAF 1%" = prunout_chr_0.01,
  "MAF 10%" = prunout_chr_0.1
)

# create plot
venn_plot1 <- ggvenn(venn_data1, fill_color = c("steelblue", "darkorange"), show_percentage = TRUE)

# Add a title
venn_plot1 <- venn_plot1 +
  ggtitle("Comparison of SNP sets for Europe dataset") +
  theme(plot.title = element_text(hjust = .5))

# Display the Venn diagram
print(venn_plot1)

# Save Venn diagram to PDF
output_path <- here("output", "europe", "figures", "SNPs_comparison_europe_MAFs_r2_0.01.pdf")
ggsave(output_path, venn_plot1, height = 5, width = 5, dpi = 300)