1. Load R libraries and software

library(tidyverse)
library(here)
library(dplyr)
library(ggplot2)
library(colorout)
library(extrafont)
library(reticulate)
#library(scales)
library(stringr)
library(grid)
library(flextable)
library(devtools)
library(readr)
library(purrr)
library(ggtext)
library(ggvenn)
#library(data.table)
library(RColorBrewer)
library(ggrepel)

1.2 Get software for quality control

Check your Plink version.

module spider plink
#Versions on the cluster:
        #PLINK/1.9b_6.21-x86_64
        #PLINK/2_avx2_20221024
module load PLINK/2_avx2_20221024

plink2 --version
#PLINK v2.00a3.7LM AVX2 Intel (24 Oct 2022)

Check your BCFtools version.

module spider BCFtools
#BCFtools/1.16-GCCcore-10.2.0

module load BCFtools/1.16-GCCcore-10.2.0

1.3 Check if the data is in the correct location

ls data/europe/* # the * wildcard lists all matching files

cd /gpfs/gibbs/pi/caccone/mkc54/albo
## data/europe/albo_euro_global_1Dec2023.log
## data/europe/albo_euro_global_1Dec2023.vcf
## data/europe/albo_europe_11Sep2023.log
## data/europe/albo_europe_11Sep2023.vcf
## data/europe/euro_asia_8america_4africa_28Nov2023.log
## data/europe/euro_asia_8america_4africa_28Nov2023.vcf
## data/europe/europe_native_11Sep2023.log
## data/europe/europe_native_11Sep2023.vcf

We can check how many sample names we have in our VCF.

# make sure you have all the .CEL samples in your family file
bcftools query -l /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe/albo_euro_global_1Dec2023.vcf | wc -l
#712

2. The data

2.1 Use Plink2 to convert to bed format

cd /gpfs/gibbs/pi/caccone/mkc54/albo

plink2 \
--allow-extra-chr \
--vcf data/europe/albo_euro_global_1Dec2023.vcf \
--const-fid \
--make-bed \
--exclude europe/output/files/albopictus_SNPs_fail_segregation.txt \
--fa /gpfs/gibbs/project/caccone/mkc54/albo/genome/albo.fasta.gz \
--ref-from-fa 'force' `# sets REF alleles when it can be done unambiguously, we use force to change the alleles` \
--out euro_global/output/file1 \
--silent;
# --keep-allele-order \ if you use Plink 1.9
grep "variants" euro_global/output/file1.log; # to get the number of variants from the log file.

--vcf: 113823 variants scanned.
113823 variants loaded from euro_global/output/file1-temporary.pvar.zst.
--exclude: 112349 variants remaining.
112349 variants remaining after main filters.
--ref-from-fa force: 0 variants changed, 112349 validated.

Check the fam file

head euro_global/output/file1.fam
## OKI  1001    0   0   2   -9
## OKI  1002    0   0   2   -9
## OKI  1003    0   0   2   -9
## OKI  1004    0   0   2   -9
## OKI  1005    0   0   2   -9
## OKI  1006    0   0   1   -9
## OKI  1007    0   0   1   -9
## OKI  1008    0   0   1   -9
## OKI  1009    0   0   1   -9
## OKI  1010    0   0   1   -9
head -n 5 scripts/RMarkdowns/sample_ped_info_ALLPOPS_1Apr2024.txt
## Sample Filename  Family_ID   Individual_ID   Father_ID   Mother_ID   Sex Affection Status
## 1_LabCross_KF2.CEL   LAB 1   0   0   2   -9
## 2_LabCross_MF81.CEL  LAB 2   0   0   2   -9
## 3_LabCross_MF71.CEL  LAB 3   0   0   2   -9
## 4_LabCross_KF42.CEL  LAB 4   0   0   2   -9

2.2 Use R to update the .fam file

Import the fam file we use with Axiom Suite

# the order of the rows in this file does not matter
samples <-
  read.delim(
    file   = here("scripts", "RMarkdowns", "sample_ped_info_ALLPOPS_fixed.txt"
    ),
    header = TRUE
  )
head(samples)
##     Sample.Filename Family_ID Individual_ID Father_ID Mother_ID Sex
## 1  8_MAN_Brazil.CEL       MAU             8         0         0   0
## 2  9_MAN_Brazil.CEL       MAU             9         0         0   0
## 3 16_MAN_Brazil.CEL       MAU            16         0         0   0
## 4 17_MAN_Brazil.CEL       MAU            17         0         0   0
## 5 18_MAN_Brazil.CEL       MAU            18         0         0   0
## 6 60_MAN_Brazil.CEL       MAU            60         0         0   0
##   Affection.Status
## 1               -9
## 2               -9
## 3               -9
## 4               -9
## 5               -9
## 6               -9

Import .fam file we created once we created the bed file using Plink2

#The fam file is the same for both data sets with the default or new priors
fam1 <-
  read.delim(
    file   = here("euro_global/output/file1.fam"),
    header = FALSE
  )
head(fam1)
##    V1   V2 V3 V4 V5 V6
## 1 OKI 1001  0  0  2 -9
## 2 OKI 1002  0  0  2 -9
## 3 OKI 1003  0  0  2 -9
## 4 OKI 1004  0  0  2 -9
## 5 OKI 1005  0  0  2 -9
## 6 OKI 1006  0  0  1 -9

We can merge the tibbles.

# Extract the number part from the columns
fam1_temp <- fam1 |>
  mutate(num_id = as.numeric(str_extract(V2, "^\\d+")))

samples_temp <- samples |>
  mutate(num_id = as.numeric(str_extract(Sample.Filename, "^\\d+")))

# Perform the left join using the num_id columns and keep the order of fam1
df <- fam1_temp |>
  dplyr::left_join(samples_temp, by = "num_id") |>
  dplyr::select(-num_id) |>
  dplyr::select(8:13)

# check the data frame
head(df)
##   Family_ID Individual_ID Father_ID Mother_ID Sex Affection.Status
## 1       OKI          1001         0         0   2               -9
## 2       OKI          1002         0         0   2               -9
## 3       OKI          1003         0         0   2               -9
## 4       OKI          1004         0         0   2               -9
## 5       OKI          1005         0         0   2               -9
## 6       OKI          1006         0         0   1               -9

We can check how many samples we have in our file

nrow(df)
## [1] 712
#712

Before you save the new fam file, you can rename the original file so you can compare the order later. If you want to repeat the steps above after saving the new file1.fam, you will need to import the VCF again.
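
For example, a minimal backup step in R before overwriting (the "_original" suffix is just an illustration):

# back up the original .fam before we overwrite it below
fam_path <- here("euro_global/output/file1.fam")
backup   <- sub("\\.fam$", "_original.fam", fam_path)
if (!file.exists(backup)) file.copy(fam_path, backup)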

# Save and overwrite the .fam file for dp
write.table(
  df,
  file      = here("euro_global/output/file1.fam"),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Check the new .fam file to see if it has the order and the sample attributes we want.

# you can open the file on a text editor and double check the sample order and information.
head -n 5 euro_global/output/file1.fam
## OKI  1001    0   0   2   -9
## OKI  1002    0   0   2   -9
## OKI  1003    0   0   2   -9
## OKI  1004    0   0   2   -9
## OKI  1005    0   0   2   -9

2.3 Remove duplicates

We have 4 samples genotyped 3 times each; we keep only one copy of each.

We can check for the triplicates in our dataset

# Check triplicates
grep "a\|b\|c" euro_global/output/file1.fam

To subset the data, we need to create a list of samples with family and individual IDs.

grep "b\|c" euro_global/output/file1.fam | awk '{print $1, $2}' > euro_global/output/files/duplicates_to_remove.txt;
head euro_global/output/files/duplicates_to_remove.txt

Create a new bed file, removing the duplicated samples. We also make sure the reference alleles match the reference genome.

# I created a fam file with the information about each sample, but first we import the data and create a bed file setting the family id constant
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file1 \
--make-bed \
--fa /gpfs/gibbs/project/caccone/mkc54/albo/genome/albo.fasta.gz \
--ref-from-fa 'force' `# sets REF alleles when it can be done unambiguously, we use force to change the alleles` \
--exclude europe/output/files/albopictus_SNPs_fail_segregation.txt \
--remove euro_global/output/files/duplicates_to_remove.txt \
--out euro_global/output/file1b \
--silent;
# --keep-allele-order \ if you use Plink 1.9
grep "samples\|variants" euro_global/output/file1b.log # to get the number of variants from the log file.

712 samples (85 females, 70 males, 557 ambiguous; 712 founders) loaded from euro_global/output/file1.fam.
112349 variants loaded from euro_global/output/file1.bim.
--exclude: 112349 variants remaining.
--remove: 708 samples remaining.
708 samples (85 females, 70 males, 553 ambiguous; 708 founders) remaining after main filters.
112349 variants remaining after main filters.
--ref-from-fa force: 0 variants changed, 112349 validated.

Check the headings of the the files we will work on.

head -n 5 euro_global/output/file1b.fam
## OKI  1001    0   0   2   -9
## OKI  1002    0   0   2   -9
## OKI  1003    0   0   2   -9
## OKI  1004    0   0   2   -9
## OKI  1005    0   0   2   -9

Check how many samples are in the .fam file

wc -l euro_global/output/file1b.fam
#708
## 708 euro_global/output/file1b.fam

2.4 Checking the number of samples and localities

This part is very important. Make sure there are no NAs or unexpected values; for example, check that the number of mosquitoes per population is what you expect.

awk '{print $1}' euro_global/output/file1b.fam | sort | uniq -c | awk '{print $2, $1}' 
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BEN 12
## BER 12
## BRE 13
## BUL 10
## CAM 12
## CES 14
## CHA 12
## CRO 12
## DES 17
## FRS 12
## GEL 2
## GES 12
## GRA 12
## GRC 10
## GRV 12
## HAI 12
## HAN 4
## HOC 7
## HUN 12
## IMP 4
## INJ 11
## INW 4
## ITB 6
## ITP 10
## ITR 12
## JAF 2
## KAC 6
## KAG 12
## KAN 12
## KAT 10
## KER 12
## KLP 4
## KRA 12
## KUN 4
## LAM 9
## MAL 12
## MAT 12
## OKI 12
## PAL 11
## POL 2
## POP 12
## QNC 12
## RAR 12
## REC 12
## ROM 4
## ROS 12
## SER 4
## SEV 12
## SIC 12
## SLO 12
## SOC 12
## SON 3
## SPB 8
## SPC 6
## SPM 6
## SPS 9
## SSK 12
## STS 12
## SUF 6
## SUU 6
## TAI 7
## TIK 12
## TIR 4
## TRE 12
## TUA 12
## TUH 12
## UTS 12
## YUN 9
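
As a cross-check, the same per-population count can be computed in R (a sketch, assuming the .fam is whitespace-delimited):

# count individuals per population (V1 is the family/population ID)
fam <- read.table(here("euro_global/output/file1b.fam"), header = FALSE)
anyNA(fam$V1)                                  # should be FALSE
fam |>
  dplyr::count(V1, name = "n_individuals") |>
  dplyr::arrange(V1)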

3. Quality control steps

3.1 Missingness

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file1b \
--geno 0.1                `# we set genotype missingness to 10% with this option` \
--make-bed \
--out euro_global/output/file2 \
--silent \
--missing;                 # --missing produces sample-based and variant-based missing data reports. If run with --within/--family, the variant-based report is stratified by cluster.
grep "variants" euro_global/output/file2.log

112349 variants loaded from euro_global/output/file1b.bim.
--geno: 8526 variants removed due to missing genotype data.
103823 variants remaining after main filters.

Make plot

#   ____________________________________________________________________________
#   import individual missingness                                           ####
indmiss <-                                              # name of the data frame we are creating
  read.delim(                                           # use function read table
    file   = here(
      "euro_global/output/file2.smiss"
    ),                                                  # we use here() to locate file2.smiss in euro_global/output
    header = TRUE                                       # we do have headers in our file
  )
#   ____________________________________________________________________________
#   import SNP missingness                                                  ####
snpmiss <-
  read.delim(
    file   = here(
      "euro_global/output/file2.vmiss"
    ),
    header = TRUE
  )
#

Plot individual missingness

# load plotting theme
source(
  here("scripts", "RMarkdowns", "my_theme2.R"
  )
)

ggplot( # Start a ggplot object with the data and aesthetic mappings
  indmiss,
  aes(
    x = F_MISS
  )
) +
  geom_histogram( # Add a histogram layer
    color            = "black",
    fill             = "lightgray",
    bins             = 6
  ) +
  geom_text(
    # Add text labels for bin counts
    stat             = "bin",
    aes(
    label = after_stat(count)
  ),
    vjust            = -0.5,
    color            = "black",
    size             = 3,
    bins             = 6
  ) +
  geom_vline(
    # Add a vertical line at the mean of F_MISS
    aes(
    xintercept = mean(F_MISS)),
    color            = "red",
    linetype         = "dotted",
    linewidth        = .5
  ) +
  geom_text(
    # Add a text label for the mean of F_MISS
    aes(
      x = mean(F_MISS),
      y = 75,
      label = paste0(
        "Mean \n",
        scales::percent(mean(F_MISS),
          accuracy = 0.01
        )
      )
    ),
    size = 3,
    color = "red",
    hjust = -.1
  ) +
  labs( # Add axis labels
    x                = "Individual Missingness (%)",
    y                = "Count (n)"
  ) +
  my_theme() +
  scale_x_continuous( # Scale the x-axis to display percentages
    labels           = scales::percent,
    n.breaks         = 6
  )
## Warning in geom_text(aes(x = mean(F_MISS), y = 75, label = paste0("Mean \n", : All aesthetics have length 1, but the data has 708 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

#
# save the plot
ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "figures" , "individual_missingness10_euro_global.pdf"
  ),
  width              = 7,
  height             = 5,
  units              = "in"
)
# Define a function to customize the theme
my_theme <- function() {
  theme_minimal(base_size = 12, base_family = "") +
    theme(
      panel.grid.major = element_line(
        linetype = "dashed",
        linewidth = 0.2,
        color = "gray"
      ),
      panel.grid.minor = element_line(
        linetype = "dashed",
        linewidth = 0.2,
        color = "gray"
      ),
      # Customize the x-axis label
      axis.title.x = element_text(
        angle          = 0,
        hjust          = 1,
        face           = "bold"
      ),
      # Customize the y-axis label
      axis.title.y = element_text(
        angle          = 90,
        hjust          = 1,
        face           = "bold"
      )
    )
}
# we can save the function to source it later
dump(                                                    # check ?dump for more information
  "my_theme",                                            # the object we want to save
  here("scripts", "RMarkdowns", 
   "my_theme2.R")                # use here to save it our function as .R 
)

Plot variant missingness

# This plot takes a while to compute
# This code creates a histogram from the snpmiss data set using the F_MISS column.
# ggplot builds a histogram of individual missingness data
ggplot(
  snpmiss,
  aes(
    x = F_MISS
  )
) +
  geom_histogram(
    color = "black",
    fill = "lightgray",
    bins = 6
  ) +
  stat_bin(
    geom = "text",
    aes(
      label = format(
        after_stat(count),
        big.mark = ",",
        scientific = FALSE
      )
    ),
    vjust = -0.5,
    color = "black",
    size = 2,
    bins = 6
  ) +
  geom_vline(
    aes(
      xintercept = mean(F_MISS)
    ),
    color = "red",
    linetype = "dotted",
    linewidth = 0.5
  ) +
  geom_text(
    aes(
      x = mean(F_MISS),
      y = 16000,
      label = paste0(
        "Mean \n",
        scales::percent(mean(F_MISS),
          accuracy = 0.01
        )
      )
    ),
    size = 3,
    color = "red",
    # hjust = 1.5,
    vjust = -.2
  ) +
  labs(
    x = "Variant Missingness (%)",
    y = "Count (n)"
  ) +
  # theme_minimal(
  #   base_size = 12,
  #   base_family = "Roboto Condensed"
  # ) +
  scale_x_continuous(
    labels = scales::percent,
    n.breaks = 6
  ) +
  scale_y_continuous(
    labels = scales::label_comma(),
    n.breaks = 5
  ) +
  my_theme()
## Warning in geom_text(aes(x = mean(F_MISS), y = 16000, label = paste0("Mean \n", : All aesthetics have length 1, but the data has 112349 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

# save the plot
ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "figures", "SNPs_missingness10_euro_global.pdf"
  ),
  width  = 7,
  height = 5,
  units  = "in"
)

Remove individuals missing more than 20% of SNPs. You can use any threshold you want by changing the --mind flag.

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file2 \
--mind 0.2               `# here we set the individual missingness threshold of 20%`\
--make-bed \
--out euro_global/output/file3 \
--silent;
grep "samples\|variants" euro_global/output/file3.log

708 samples (85 females, 70 males, 553 ambiguous; 708 founders) loaded from euro_global/output/file2.fam.
103823 variants loaded from euro_global/output/file2.bim.
0 samples removed due to missing genotype data (--mind).
708 samples (85 females, 70 males, 553 ambiguous; 708 founders) remaining after main filters.

3.2 Minor allele frequency

First, we estimate the allele frequency with Plink.

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file3 \
--freq \
--out euro_global/output/MAF_check \
--silent

Now we apply the MAF filter.

# We will use MAF of 10%
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file3 \
--maf 0.1 \
--make-bed \
--out euro_global/output/file4 \
--silent;
grep "variants" euro_global/output/file4.log

103823 variants loaded from euro_global/output/file3.bim.
16640 variants removed due to allele frequency threshold(s).
87183 variants remaining after main filters.

Then we plot it with ggplot.

#   Import MAF data                                                         ####
maf_freq <-
  read.delim(
    here(
      "euro_global/output/MAF_check.afreq"
    ),
    header = TRUE
  )

Make MAF plot

#   make the plot                                                           ####
ggplot(
  maf_freq,
  aes(ALT_FREQS)
) +
  geom_histogram(
    colour = "black",
    fill = "lightgray",
    bins = 40
  ) +
  labs(
    x = "Minor Allele Frequency (MAF)",
    y = "Count (n)",
    caption = "<span style='color:red;'><i>Red</i></span> <span style='color:black;'><i>line at</i></span><span style='color:red;'><i> MAF 10%</i></span><span style='color:black;'><i> threshold</i></span>."
  ) +
  geom_text(
    aes(
      x = .1,
      y = 8000,
      label = paste0("16,640 SNPs")
    ),
    size = 3,
    color = "red",
    vjust = -.2
  ) +
  geom_vline(xintercept = 0.1, color = "red") +
  my_theme() +
  theme(plot.caption = element_markdown()) +
  scale_y_continuous(label = scales::number_format(big.mark = ",")) +
  scale_x_continuous(breaks = c(0, 0.1, 0.2, 0.4, 0.6, 0.8, 1))
## Warning in geom_text(aes(x = 0.1, y = 8000, label = paste0("16,640 SNPs")), : All aesthetics have length 1, but the data has 103823 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

# save the plot
ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global", "figures", "MAF_freq_plot_euro_global.pdf"
  ),
  width  = 7,
  height = 5,
  units  = "in"
)

We removed 16,640 variants due to the MAF filter. Next we will exclude markers that deviate from Hardy–Weinberg equilibrium (HWE). Deviation from HWE is a common indicator of genotyping error, but may also indicate evolutionary selection. We have to test each population individually; we cannot do it for all populations at once. Therefore, the first step is to create a new bed file with Plink keeping only one population. I like to create a new directory named "hardy" and copy "file4" there.

mkdir -p euro_global/output/files/hardy;
cp euro_global/output/file4.* euro_global/output/files/hardy/

3.3 HWE test

Now we can run the HWE test. However, we will need to apply the SNP missingness filter again for each population; if we do not, the HWE estimates will vary widely. With the bash script below, we create a new file for each population and run the HWE test with a p-value threshold of 1e-6. Then we ask Plink to generate a list of SNPs that passed the test in each population.
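
To see what the test does, here is a toy chi-square HWE check on one SNP's genotype counts in R (the counts are made up; Plink's --hwe actually uses an exact test, so this is only an approximation):

# toy illustration of an HWE test on one SNP (made-up genotype counts)
obs <- c(AA = 30, Aa = 40, aa = 30)
n   <- sum(obs)
p   <- unname((2 * obs["AA"] + obs["Aa"]) / (2 * n))  # frequency of allele A
expected <- n * c(p^2, 2 * p * (1 - p), (1 - p)^2)    # HWE-expected counts
chisq    <- sum((obs - expected)^2 / expected)
pchisq(chisq, df = 1, lower.tail = FALSE)             # compare to the 1e-6 threshold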

for fam in $(awk '{print $1}' euro_global/output/files/hardy/file4.fam | sort | uniq); 
do 
echo $fam | \
plink2 \
--allow-extra-chr \
--silent \
--keep-allele-order \
--bfile euro_global/output/files/hardy/file4 \
--keep-fam /dev/stdin \
--make-bed \
--out euro_global/output/files/hardy/$fam \
--hwe 0.000001 \
--geno 0.1 \
--write-snplist; \
done

Next, we use "cat" and "awk" to concatenate the SNP lists from all populations and remove duplicates. Once we have the list of SNPs that passed the test in each population, we can use Plink to create a new bed file keeping only those SNPs. First, let's get the list of SNPs and count how many passed:

cd /gpfs/gibbs/pi/caccone/mkc54/albo
cat euro_global/output/files/hardy/*.snplist | awk '!a[$0]++' > euro_global/output/files/passed_hwe.txt;
wc -l euro_global/output/files/passed_hwe.txt
## 87183 euro_global/output/files/passed_hwe.txt
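
As a cross-check, the same concatenation and de-duplication can be done in R (a sketch):

# read every per-population .snplist and keep unique SNP IDs
snplists <- list.files(here("euro_global/output/files/hardy"),
                       pattern = "\\.snplist$", full.names = TRUE)
passed   <- unique(unlist(lapply(snplists, readLines)))
length(passed)  # should match the wc -l count above: 87183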

All variants passed the HWE test. If some had failed, we could remove the variants that did not pass by using the --extract flag to keep only those that passed HWE.

3.4 Linkage analysis

We can analyse the LD patterns for each population using different approaches. If we want to look at general patterns, we can get an estimate for each population, for example, the half distance of the maximum r^2. We can use Plink2 to estimate some of these statistics for us. One important consideration is the quality of our genome assembly: does it matter whether we use the 574 scaffolds as they are or assign each scaffold to a chromosome? To find out, you would need to create a new chromosomal scale and repeat the calculations. Since we only want a general, approximate estimate of the linkage levels, we can look at the largest scaffolds. Because chromosome 1 carries the M locus, we will use the largest scaffolds of chromosomes 2 and 3. We will create a new chromosomal scale later, when we plot the data. For now, we can import a file that LC created with the size of each scaffold (data/genome/scaffold_sizes.txt). The code below imports the file and gets the largest scaffold of each chromosome.

#   import the file with the scaffold sizes                                 ####
scaffold_bps <-
  read_delim(
    here("scripts", "RMarkdowns",
      "scaffold_sizes.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "cn"
  )
#
# set column names
colnames(
  scaffold_bps
) <- c(
  "Scaffold", "Size"
)
#   ____________________________________________________________________________
#   create new column with the chromosome number                            ####
sizes <-
  scaffold_bps |>
  mutate(
    Chromosome = case_when( # we use mutate to create a new column called Chromosome
      startsWith(
        Scaffold, "1"
      ) ~ "1",              # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
      startsWith(
        Scaffold, "2"
      ) ~ "2",
      startsWith(
        Scaffold, "3"
      ) ~ "3"
    )
  )
#
# group by Chromosome and get row with highest Size value
df_max <-
  sizes |>
  group_by(
    Chromosome
  ) |>
  slice_max(
    Size
  )
#
# print result
df_max |>
  mutate(
    Size_Mb = Size/1e6
  )
## # A tibble: 3 × 4
## # Groups:   Chromosome [3]
##   Scaffold     Size Chromosome Size_Mb
##   <chr>       <dbl> <chr>        <dbl>
## 1 1.85     28902815 1             28.9
## 2 2.17     43762705 2             43.8
## 3 3.75     25123842 3             25.1

The largest scaffold of chromosome 2 is 43 Mb, and that of chromosome 3 is 25 Mb. We can also take a look at the size distribution of the scaffolds using ggplot2.

# create histogram using ggplot2
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
ggplot(
  sizes,
  aes(
    x = Size
  )
) +
  geom_histogram(
    binwidth = 1e6,
    aes(fill = factor(
      Chromosome
    )),
    color = "lightgray"
  ) +
  labs(
    title = "Scaffold sizes per chromosome",
    x = "Scaffold Size (Mb)",
    y = "Count (n)"
  ) +
  # xlab("Scaffold Size (Mb)") +
  # ylab("Count (n)") +
  my_theme() +
  scale_x_continuous(
    labels = comma_format(
      scale = 1e-6
    )
  ) +
  facet_wrap(
    ~Chromosome,
    ncol = 3,
    scales = "free_x"
  ) +
  scale_fill_manual(
    values = c(
      "pink", "green", "orange"
    )
  ) +
  guides(fill = "none")

Now, we can create a list of SNPs from these two large scaffolds, which we hope will give us an average LD statistic representative of the entire genome. We can import the .bim file and get the list of variants from scaffolds 2.17 (43 Mb) and 3.75 (25 Mb).

Import the .bim file with the SNPs

#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "euro_global/output/file4.bim" #output/populations/file4.bim
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.1      AX-583035163     0   315386 A       G      
## 2 1.1      AX-583033356     0   315674 C       T      
## 3 1.1      AX-583033370     0   330057 G       A      
## 4 1.1      AX-583035194     0   330265 A       G      
## 5 1.1      AX-583033387     0   331288 C       T      
## 6 1.10     AX-583035257     0    91677 T       C

We can write a function to import the bim files.

#   function to import bim files                                            ####
#
import_bim <- function(file) {
  # import as a tibble and set columns as integers
  bim <-
    read_delim(
      file,
      col_names      = FALSE,
      show_col_types = FALSE,
      col_types      = "ccidcc"
    )
  # rename the columns by index
  bim <- bim |>
    rename(
      Scaffold = 1,
      SNP      = 2,
      Cm       = 3,
      Position = 4,
      Allele1  = 5,
      Allele2  = 6
    )
  return(bim)
}
# we can save the function to source it later
dump(                                                     # check ?dump for more information
  "import_bim",                                           # the object we want to save
  here("scripts", "RMarkdowns",
    "analyses", "import_bim.R")       # use here to save it our function as .R 
)
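
For example, we can re-import file4.bim with the function and confirm it matches the manual import above (a quick sanity check):

snps_check <- import_bim(here("euro_global/output/file4.bim"))
identical(dim(snps_check), dim(snps))  # TRUE if both imports agree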

Now we can create the list of SNPs we want to use for the general linkage analysis.

snps_4ld <-
  snps |>
  filter( # scaffolds 2.17 (43Mb) and 3.75 (25Mb)
    Scaffold == "2.17" | Scaffold == "3.75"
  ) |> 
  select(
    "Scaffold",
    "SNP"
  )
nrow(snps_4ld)
## [1] 5898

We have 5,898 SNPs for estimating LD, located in the two largest scaffolds of chromosomes 2 and 3.

#   save calculation to load later                                          ####
write.table(
  snps_4ld,
  file      = here(
    "euro_global/output/files/ld/snps_4ld.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Now we have to select the populations for which we want to estimate the LD statistics. Let's only estimate LD for populations with 12 or more individuals.

awk '{print $1}' euro_global/output/file4.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 12 {print}'
## ALU 12
## ALV 12
## BAR 12
## BEN 12
## BER 12
## BRE 13
## CAM 12
## CES 14
## CHA 12
## CRO 12
## DES 17
## FRS 12
## GES 12
## GRA 12
## GRV 12
## HAI 12
## HUN 12
## ITR 12
## KAG 12
## KAN 12
## KER 12
## KRA 12
## MAL 12
## MAT 12
## OKI 12
## POP 12
## QNC 12
## RAR 12
## REC 12
## ROS 12
## SEV 12
## SIC 12
## SLO 12
## SOC 12
## SSK 12
## STS 12
## TIK 12
## TRE 12
## TUA 12
## TUH 12
## UTS 12

Now we can create a list of populations that have 12 or more individuals, using the command above but printing only the family IDs. We can also create a new directory for the LD analysis.

Create list of populations

awk '{print $1}' euro_global/output/file4.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 12 {print}' | awk '{print $1}' > euro_global/output/files/ld/pops_4ld.txt
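
The same selection can be reproduced in R (a sketch, assuming a whitespace-delimited .fam):

# populations with 12 or more individuals
fam4 <- read.table(here("euro_global/output/file4.fam"), header = FALSE)
pops_4ld <- fam4 |>
  dplyr::count(V1) |>
  dplyr::filter(n >= 12) |>
  dplyr::pull(V1)
length(pops_4ld)  # 41 populations, matching the listing above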

3.4.1 Half distance r^2 max

Now we can use Plink1.9 to estimate LD and then bin the data

#load plink 1.9
module load PLINK/1.9b_6.21-x86_64
plink --version
#PLINK v1.90b6.21 64-bit (19 Oct 2020)
for fam in $(awk '{print $1}' euro_global/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--extract euro_global/output/files/ld/snps_4ld.txt \
--bfile euro_global/output/file4 \
--keep-fam /dev/stdin \
--maf 0.1 \
--r2 \
--out euro_global/output/files/ld/$fam \
--geno 0.2 \
--write-snplist \
--silent
done;
#
rm euro_global/output/files/ld/*.nosex

Once we subset the bed files per population, we apply a MAF filter of 10% as before, but relax the genotype missingness threshold to 20% (--geno 0.2), since we now have smaller sample sizes and not all populations will have all the variants.

import os

# Replace with the directory containing the .snplist files
directory = "euro_global/output/files/ld"

# Get a list of all .snplist files in the directory
file_list = [filename for filename in os.listdir(directory) if filename.endswith(".snplist")]

# Create a set to store the unique variants
unique_variants = set()

# Loop over each file and add the variants to the set
for filename in file_list:
    with open(os.path.join(directory, filename), "r") as file:
        for line in file:
            unique_variants.add(line.strip())

# Create a dictionary to store the number of files each variant appears in
variant_counts = {variant: 0 for variant in unique_variants}

# Loop over each file again and count the variants
for filename in file_list:
    with open(os.path.join(directory, filename), "r") as file:
        for line in file:
            variant = line.strip()
            variant_counts[variant] += 1

# Calculate the threshold for the number of files a variant must appear in
threshold = int(len(file_list) * 0.50)

# Get the list of unique variants that appear in at least 50% of files
all_variants = [variant for variant, count in variant_counts.items() if count >= threshold]

# Save the list of variants to a new file
with open("euro_global/output/files/ld/all_unique_variants_50pct.txt", "w") as file:
    file.write("\n".join(all_variants))
    
#64297
## 64297

Check how many unique SNPs we have

cat euro_global/output/files/ld/all_unique_variants_50pct.txt | sort | awk '!seen[gensub(/^[[:space:]]+|[[:space:]]+$/, "", "g")]++' | wc -l
## 4946

Now we can estimate LD again using the 4,946 variants that are present in at least 50% of the populations.

Now we can use Plink1.9 to estimate LD

for fam in $(awk '{print $1}' euro_global/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--extract euro_global/output/files/ld/all_unique_variants_50pct.txt \
--bfile euro_global/output/file4 \
--keep-fam /dev/stdin \
--maf 0.1 \
--r2 \
--out euro_global/output/files/ld/$fam \
--geno 0.2 \
--write-snplist \
--silent
done;
#
rm euro_global/output/files/ld/*.nosex

Let's check one of the files.

head /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/files/ld/ALU.ld
##  CHR_A BP_A SNP_A CHR_B BP_B SNP_B R2 
##  2.17 56405 AX-584670672 2.17 70977 AX-584670684 0.571429 
##  2.17 56405 AX-584670672 2.17 71211 AX-585179355 0.308571 
##  2.17 56405 AX-584670672 2.17 72138 AX-584670713 0.326531 
##  2.17 56405 AX-584670672 2.17 107547 AX-585179519 0.204082 
##  2.17 56405 AX-584670672 2.17 115267 AX-585179548 0.344234 
##  2.17 56405 AX-584670672 2.17 124491 AX-585179568 0.232126 
##  2.17 56405 AX-584670672 2.17 146817 AX-584670926 0.205882 
##  2.17 70977 AX-584670684 2.17 71211 AX-585179355 0.24 
##  2.17 70977 AX-584670684 2.17 72138 AX-584670713 0.223214

Clean up the whitespace in the LD files from Plink.

ld_files <- list.files(path = "euro_global/output/files/ld", pattern = "\\.ld$", full.names = TRUE)

# squeeze runs of blanks to single spaces; -i inplace requires GNU awk (gawk) >= 4.1
for (file in ld_files) {
  system(paste("awk -i inplace '{gsub(/[[:blank:]]+/, \" \")}1'", file))
}
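
If gawk's -i inplace option is not available on your system, a pure-R alternative is (a sketch):

# squeeze runs of blanks to single spaces, rewriting each file in place
for (file in ld_files) {
  txt <- readLines(file)
  writeLines(gsub("[[:blank:]]+", " ", txt), file)
}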

Let's check one of the files again.

head euro_global/output/files/ld/ALU.ld
##  CHR_A BP_A SNP_A CHR_B BP_B SNP_B R2 
##  2.17 56405 AX-584670672 2.17 70977 AX-584670684 0.571429 
##  2.17 56405 AX-584670672 2.17 71211 AX-585179355 0.308571 
##  2.17 56405 AX-584670672 2.17 72138 AX-584670713 0.326531 
##  2.17 56405 AX-584670672 2.17 107547 AX-585179519 0.204082 
##  2.17 56405 AX-584670672 2.17 115267 AX-585179548 0.344234 
##  2.17 56405 AX-584670672 2.17 124491 AX-585179568 0.232126 
##  2.17 56405 AX-584670672 2.17 146817 AX-584670926 0.205882 
##  2.17 70977 AX-584670684 2.17 71211 AX-585179355 0.24 
##  2.17 70977 AX-584670684 2.17 72138 AX-584670713 0.223214

Function to import LD data

import_ld_files <- function(directory) {
  ld_files <-
    list.files(
      path = directory,
      pattern = "\\.ld$",
      full.names = TRUE
    )

  ld_tibbles <- list()

  for (file in ld_files) {
    ld_name <- gsub(".ld", "", basename(file))
    ld_data <- read_delim(
      file,
      col_names = TRUE,
      delim = " ",
      show_col_types = FALSE
    ) %>%
      select(c("CHR_A", "BP_A", "SNP_A", "CHR_B", "BP_B", "SNP_B", "R2"))
    ld_tibbles[[ld_name]] <- ld_data
  }

  return(ld_tibbles)
}

# we can save the function to source it later
dump(                                                     # check ?dump for more information
  "import_ld_files",                                           # the object we want to save
  here("scripts", "RMarkdowns",
    "analyses", "import_ld_files.R")       # use here to save it our function as .R 
)

Import the LD data

my_ld_tibbles <- import_ld_files("euro_global/output/files/ld/")


head(my_ld_tibbles$ALV)
## # A tibble: 6 × 7
##   CHR_A  BP_A SNP_A        CHR_B   BP_B SNP_B           R2
##   <dbl> <dbl> <chr>        <dbl>  <dbl> <chr>        <dbl>
## 1  2.17 70977 AX-584670684  2.17 161344 AX-584670951 0.424
## 2  2.17 70977 AX-584670684  2.17 193047 AX-584670995 0.257
## 3  2.17 70977 AX-584670684  2.17 201334 AX-585179663 0.490
## 4  2.17 70977 AX-584670684  2.17 213672 AX-584671067 0.5  
## 5  2.17 70977 AX-584670684  2.17 213933 AX-585179740 1    
## 6  2.17 70977 AX-584670684  2.17 217433 AX-584671128 0.5

Estimate the half distance of the maximum r^2 value.

Since we have several populations, it is better to write a function.

get_half_pop <- function(ld_tibble) {
  # Split the tibble into sub-tibbles by scaffold
  ld_split <- split(ld_tibble, ld_tibble[, 1])
  # Remove rows with missing data
  ld_no_nan <- lapply(ld_split, function(x) {
    x[complete.cases(x), ]
  })
  # Calculate the distance between SNPs
  ld_dist <-
    lapply(ld_no_nan, function(x) {
      cbind(x, x[, 5] - x[, 2])
    })
  # Sort the sub-tibbles by distance
  ld_dist_order <- lapply(ld_dist, function(x) {
    x[order(x[, 8]), ]
  })
  # Combine the sub-tibbles into one tibble
  ld_proc <- do.call(rbind, ld_dist_order)
  # Sort the tibble by distance
  ld_sorted <- ld_proc[order(ld_proc[, 8]), ]
  # Bin the data by distance
  bin.1000 <-
    cut(ld_sorted[, 8], (c(0:1000) * 1000), include.lowest = TRUE)
  # Calculate the mean R2 for each bin
  ld_bin_1000_sorted <-
    tapply(ld_sorted[, 7], bin.1000, function(x) {
      mean(x)
    })
  # Fit a LOESS curve to the binned data (optional smoothing; not used below)
  lo.1000.ld <- loess(ld_bin_1000_sorted ~ c(1:1000))
  # Find the value of the binned R2 that corresponds to half the maximum R2 value
  d.pop <- max(ld_bin_1000_sorted, na.rm = TRUE)
  half.pop <- which(ld_bin_1000_sorted <= (d.pop / 2))[1]
  return(half.pop)
}
# we can save the function to source it later
dump(
  "get_half_pop",
  here("scripts", "RMarkdowns",
    "analyses", "get_half_pop.R")
)

Test the function on one population.

# for one population, you can change the population name to see other populations
half_alv <- get_half_pop(my_ld_tibbles$ALV)
half_alv
## [0,1e+03] 
##         1

Estimates for all populations

# Create an empty data.frame to store the results
half_pops_df <-
  data.frame(
    row.names = names(my_ld_tibbles),
    half_distance = numeric(length(my_ld_tibbles)),
    stringsAsFactors = FALSE
  )

# Calculate the half distance of max r^2 value for each tibble and store in the data.frame
for (tib_name in names(my_ld_tibbles)) {
  tib_half <- get_half_pop(my_ld_tibbles[[tib_name]])
  half_pops_df[tib_name, "half_distance"] <- tib_half
}

# Print the resulting data.frame
half_pops_df
##     half_distance
## ALU             1
## ALV             1
## BAR             1
## BEN             1
## BER             3
## BRE             3
## CAM             1
## CES             2
## CHA             1
## CRO             4
## DES             2
## FRS             1
## GES             2
## GRA             8
## GRV             2
## HAI             1
## HUN             1
## ITR             1
## KAG             8
## KAN             5
## KER             1
## KRA             1
## MAL             2
## MAT             1
## OKI             1
## POP             1
## QNC             1
## RAR             2
## REC             1
## ROS             5
## SEV             3
## SIC             1
## SLO             5
## SOC             5
## SSK             1
## STS             3
## TIK             3
## TRE             1
## TUA             1
## TUH             1
## UTS            61

The half distance of the maximum r^2 varies from 1 to 8 kb for most populations when we bin the data (UTS is an outlier at 61 kb). We do see larger blocks, but most LD blocks are small. You can use the entire genome to estimate LD instead of only the largest scaffolds, but the result will be almost the same. I have done it, but I included only the largest scaffolds to make it easier for anyone to repeat the analysis. Once we use the entire genome, we need to set an LD window, otherwise we end up with billions of estimates, since we would have to estimate r^2 for every pair of SNPs on each chromosome. For example, 500 SNPs on a chromosome already give choose(500, 2) = 124,750 pairwise estimates.
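
A quick check of that arithmetic in R:

choose(500, 2)    # 124,750 unique pairs for 500 SNPs
choose(87183, 2)  # ~3.8 billion pairs if we used every QC-passing SNP at once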

Make a plot for all populations

# Calculate half distances for each population
populations <- names(my_ld_tibbles)
half_distances <- sapply(populations, function(population) {
  ld_tibble <- my_ld_tibbles[[population]]
  half_pop <- get_half_pop(ld_tibble)
  return(half_pop)
})

# Create a tibble to store the half distances
half_distance_tibble <- data.frame(
  population = populations,
  half_distance = half_distances
)

# set the order of populations, you can change it on Excel then use it as a vector name pop_order
pop_order <-
  c(
    unique(
      half_distance_tibble$population
    )
  )

# set the order by continent (Africa, Americas, Asia, Europe) - check the Excel spreadsheet named Pops_ld_order.xlsx or make your own, e.g., by range or continent.
half_distance_tibble$population <-
  factor(half_distance_tibble$population, levels = pop_order)

# Plot the half distances for each population
ggplot(half_distance_tibble, aes(x = population, y = half_distance)) +
  geom_bar(
    stat = "identity",
    fill = "lightgray",
    alpha = 0.8
  ) +
  geom_text(
    aes(label = round(half_distance, 2)),
    vjust = -0.5,
    size = 4,
    color = "red"
  ) +
  labs(x = "Population", y = expression(bold("Half Distance of Max " ~ r^
    2 ~ " (kb)"))) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 14
    ),
    axis.text.y = element_text(size = 14),
    axis.title.x = element_text(size = 16, face = "bold"),
    axis.title.y = element_text(size = 16, face = "bold")
  )

# save the plot
ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global", "figures", "linkage_half_distance_LONG_scaffolds_euro_global.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in"
)

We can also plot the distribution of LD blocks across all scaffolds. Later we will create a new chromosomal scale, but for now let's estimate LD blocks using the genome as it is.

3.4.2 Estimate LD blocks using scaffolds

mkdir -p euro_global/output/files/ld/blocks

We can use Plink 1.9 to estimate LD blocks for the populations with 12 or more individuals. For this part we will use the entire genome instead of only the largest scaffolds, and set a maximum LD block size of 200 kb, since we found that the average half distance of r^2 max is relatively small.

for fam in $(awk '{print $1}' euro_global/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile euro_global/output/file4 \
--keep-fam /dev/stdin \
--maf 0.1 \
--blocks no-pheno-req \
--blocks-max-kb 200 \
--out euro_global/output/files/ld/blocks/$fam \
--geno 0.1 \
--silent
done;
#
rm euro_global/output/files/ld/blocks/*.nosex

Now we can get some data from our .log files

echo "Population,n,nVariants,geno,maf,passQC" > euro_global/output/files/ld/blocks/table_ld_stats.csv
for file in euro_global/output/files/ld/blocks/*.log
do
  variants=$(grep -oE '([0-9]+) variants loaded from \.bim file' $file | grep -oE '[0-9]+')
  geno=$(grep -oE '([0-9]+) variants removed due to missing genotype data \(--geno\)' $file | grep -oE '[0-9]+')
  maf=$(grep -oE '([0-9]+) variants removed due to minor allele threshold\(s\)' $file | grep -oE '[0-9]+')
  pass=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | head -1)
  n=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | tail -1)
  filename=$(basename $file .log)
  echo "$filename,$n,$variants,$geno,$maf,$pass" >> euro_global/output/files/ld/blocks/table_ld_stats.csv
done;
head -n 5 euro_global/output/files/ld/blocks/table_ld_stats.csv
## Population,n,nVariants,geno,maf,passQC
## ALU,12,87183,5005,23240,58938
## ALV,12,87183,4439,21462,61282
## BAR,12,87183,5935,24298,56950
## BEN,12,87183,6361,31062,49760

We can check it out.

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "euro_global/output/files/ld/blocks/table_ld_stats.csv"
    ),
  header = TRUE,
  sep = ","
)

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 1: Summary of quality control for population data.")

ft

Table 1: Summary of quality control for population data.

Population   n  nVariants    geno     maf  passQC
ALU         12     87,183   5,005  23,240  58,938
ALV         12     87,183   4,439  21,462  61,282
BAR         12     87,183   5,935  24,298  56,950
BEN         12     87,183   6,361  31,062  49,760
BER         12     87,183   5,104  18,622  63,457
BRE         13     87,183  10,298  22,602  54,283
CAM         12     87,183   7,616  26,142  53,425
CES         14     87,183  15,824  24,485  46,874
CHA         12     87,183   4,781  29,232  53,170
CRO         12     87,183   4,642  21,290  61,251
DES         17     87,183  18,316  19,093  49,774
FRS         12     87,183   3,649  18,716  64,818
GES         12     87,183   4,226  25,586  57,371
GRA         12     87,183   5,390  19,319  62,474
GRV         12     87,183   5,810  26,958  54,415
HAI         12     87,183   4,275  19,967  62,941
HUN         12     87,183   3,875  17,296  66,012
ITR         12     87,183   3,606  19,021  64,556
KAG         12     87,183   5,168  22,008  60,007
KAN         12     87,183   5,191  27,265  54,727
KER         12     87,183   3,940  22,189  61,054
KRA         12     87,183   6,336  21,330  59,517
MAL         12     87,183   3,844  18,044  65,295
MAT         12     87,183   5,074  29,941  52,168
OKI         12     87,183   5,933  25,350  55,900
POP         12     87,183   3,925  16,703  66,555
QNC         12     87,183   6,640  35,691  44,852
RAR         12     87,183   4,111  26,637  56,435
REC         12     87,183   9,288  22,616  55,279
ROS         12     87,183   6,222  22,260  58,701
SEV         12     87,183   4,472  27,759  54,952
SIC         12     87,183   8,942  22,788  55,453
SLO         12     87,183   7,077  16,189  63,917
SOC         12     87,183   4,258  25,187  57,738
SSK         12     87,183   4,939  29,439  52,805
STS         12     87,183   4,558  18,918  63,707
TIK         12     87,183   4,234  27,668  55,281
TRE         12     87,183   3,667  15,455  68,061
TUA         12     87,183   5,116  21,645  60,422
TUH         12     87,183   4,677  19,546  62,960
UTS         12     87,183   4,226  21,764  61,193

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("scripts", "RMarkdowns", "output", "euro_global", "summary_maf_geno_filter_population_euro_global.docx"))
Count the number of LD blocks per population:

cd euro_global/output/files/ld/blocks
wc -l *.blocks | \
  awk '{population = gensub(/\.blocks/, "", "g", $2); print population "\t" $1}' | \
  sed 's#/##' | \
  sed '$d' > populations_block_counts.csv;
head -n 30 populations_block_counts.csv
## ALU  83
## ALV  99
## BAR  114
## BEN  8
## BER  105
## BRE  307
## CAM  8
## CES  421
## CHA  7
## CRO  107
## DES  545
## FRS  68
## GES  186
## GRA  35
## GRV  170
## HAI  62
## HUN  48
## ITR  88
## KAG  127
## KAN  318
## KER  135
## KRA  98
## MAL  79
## MAT  13
## OKI  142
## POP  81
## QNC  36
## RAR  161
## REC  120
## ROS  86

Now we can add the number of blocks to the table we made

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
    "euro_global/output/files/ld/blocks/table_ld_stats.csv"
    ,
  header = TRUE,
  sep = ","
)

# Load the population counts data from the CSV file
pop_counts <-
  read.delim(
      "euro_global/output/files/ld/blocks/populations_block_counts.csv"
    ,
    header = F,
    sep = "\t"
  ) |>
  rename(
    Population       = 1,
    nBlocks           = 2
  )

# Merge the population counts with the table data
ld_blocks <- merge(ld_blocks, pop_counts, by = "Population")

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 2: Number of linkage blocks detected with Plink for populations with at least 12 individuals.")

ft

Table 2: Number of linkage blocks detected with Plink for populations with at least 12 individuals.

Population   n  nVariants    geno     maf  passQC  nBlocks
ALU         12     87,183   5,005  23,240  58,938       83
ALV         12     87,183   4,439  21,462  61,282       99
BAR         12     87,183   5,935  24,298  56,950      114
BEN         12     87,183   6,361  31,062  49,760        8
BER         12     87,183   5,104  18,622  63,457      105
BRE         13     87,183  10,298  22,602  54,283      307
CAM         12     87,183   7,616  26,142  53,425        8
CES         14     87,183  15,824  24,485  46,874      421
CHA         12     87,183   4,781  29,232  53,170        7
CRO         12     87,183   4,642  21,290  61,251      107
DES         17     87,183  18,316  19,093  49,774      545
FRS         12     87,183   3,649  18,716  64,818       68
GES         12     87,183   4,226  25,586  57,371      186
GRA         12     87,183   5,390  19,319  62,474       35
GRV         12     87,183   5,810  26,958  54,415      170
HAI         12     87,183   4,275  19,967  62,941       62
HUN         12     87,183   3,875  17,296  66,012       48
ITR         12     87,183   3,606  19,021  64,556       88
KAG         12     87,183   5,168  22,008  60,007      127
KAN         12     87,183   5,191  27,265  54,727      318
KER         12     87,183   3,940  22,189  61,054      135
KRA         12     87,183   6,336  21,330  59,517       98
MAL         12     87,183   3,844  18,044  65,295       79
MAT         12     87,183   5,074  29,941  52,168       13
OKI         12     87,183   5,933  25,350  55,900      142
POP         12     87,183   3,925  16,703  66,555       81
QNC         12     87,183   6,640  35,691  44,852       36
RAR         12     87,183   4,111  26,637  56,435      161
REC         12     87,183   9,288  22,616  55,279      120
ROS         12     87,183   6,222  22,260  58,701       86
SEV         12     87,183   4,472  27,759  54,952      217
SIC         12     87,183   8,942  22,788  55,453      165
SLO         12     87,183   7,077  16,189  63,917       43
SOC         12     87,183   4,258  25,187  57,738      139
SSK         12     87,183   4,939  29,439  52,805        8
STS         12     87,183   4,558  18,918  63,707      121
TIK         12     87,183   4,234  27,668  55,281      203
TRE         12     87,183   3,667  15,455  68,061       75
TUA         12     87,183   5,116  21,645  60,422      137
TUH         12     87,183   4,677  19,546  62,960      125
UTS         12     87,183   4,226  21,764  61,193      143

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("scripts", "RMarkdowns", "output", "euro_global", "summary_ld_blocks_euro_global.docx"))

Get the size of each block from the .blocks.det files

get_kb_column <- function(dir_path) {
  # obtain the list of files with extension .blocks.det
  file_names <- list.files(path = dir_path, pattern = "\\.blocks\\.det$", full.names = TRUE)
  
  # create an empty list to hold the data frames
  block_list <- list()
  
  # loop through the files and read the data into the list
  for (file in file_names) {
    df <- read.table(file, header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
    # select only the KB column and add it to the block_list with the file name
    block_list[[file]] <- df %>% select(KB) %>% add_column(file = file, .before = 1)
  }
  
  # combine the data frames in the block_list into a single data frame
  blocks <- bind_rows(block_list)
  
  # clean up the file name column
  blocks$file <- str_remove(blocks$file, "^.*\\/ld\\/blocks\\/")
  
  return(blocks)
}

# example usage: replace dir_path with your directory path
dir_path <- here("euro_global/output/files/ld/blocks")

blocks<- 
  get_kb_column(dir_path) |>
  mutate(file = str_remove(file, ".blocks.det")) |>
  as_tibble() |>
  rename(
    Population = 1
  )

Create density plot of the size of the LD blocks Plink found

# build a large qualitative color palette from RColorBrewer's qualitative palettes (74 colors)
library(RColorBrewer)
n <- 50
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
pie(rep(1,n), col=sample(col_vector, n))

col_vector
##  [1] "#7FC97F" "#BEAED4" "#FDC086" "#FFFF99" "#386CB0" "#F0027F" "#BF5B17"
##  [8] "#666666" "#1B9E77" "#D95F02" "#7570B3" "#E7298A" "#66A61E" "#E6AB02"
## [15] "#A6761D" "#666666" "#A6CEE3" "#1F78B4" "#B2DF8A" "#33A02C" "#FB9A99"
## [22] "#E31A1C" "#FDBF6F" "#FF7F00" "#CAB2D6" "#6A3D9A" "#FFFF99" "#B15928"
## [29] "#FBB4AE" "#B3CDE3" "#CCEBC5" "#DECBE4" "#FED9A6" "#FFFFCC" "#E5D8BD"
## [36] "#FDDAEC" "#F2F2F2" "#B3E2CD" "#FDCDAC" "#CBD5E8" "#F4CAE4" "#E6F5C9"
## [43] "#FFF2AE" "#F1E2CC" "#CCCCCC" "#E41A1C" "#377EB8" "#4DAF4A" "#984EA3"
## [50] "#FF7F00" "#FFFF33" "#A65628" "#F781BF" "#999999" "#66C2A5" "#FC8D62"
## [57] "#8DA0CB" "#E78AC3" "#A6D854" "#FFD92F" "#E5C494" "#B3B3B3" "#8DD3C7"
## [64] "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69" "#FCCDE5"
## [71] "#D9D9D9" "#BC80BD" "#CCEBC5" "#FFED6F"
# make plot using the sample y scale for all populations
ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = col_vector) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3) + my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here("scripts",
    "RMarkdowns",
    "output",
    "euro_global",
    "figures",
    "block_density_y_scale_fixed_euro_global.pdf"
  ),
  width = 10,
  height = 10,
  units = "in"
)

Make plot allowing the y axis scale free

ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = col_vector) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3, scales = "free_y") +
  my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here("scripts",
    "RMarkdowns",
    "output",
    "euro_global",
    "figures",
    "block_density_free_scale_euro_global.pdf"
  ),
  width = 20,
  height = 18,
  units = "in"
)

3.5 LD pruning

Since we do not have to remove any SNPs due to deviation from HWE, we can proceed with heterozygosity estimates. The first step is to "prune" our data set: we check the pairwise linkage estimates for all SNPs, working with file4. We will use --indep-pairwise to flag SNPs above a certain linkage disequilibrium (LD) threshold; check the Plink documentation for more details: https://www.cog-genomics.org/plink/1.9/ld. I used "--indep-pairwise 5 1 0.1", whose arguments are, according to the documentation: --indep-pairwise <window size>['kb'] <step size (variant ct)> <r^2 threshold>. Without the 'kb' modifier the window is measured in variant count, so we check a window of 5 SNPs for any pair with r^2 above 0.1, then move the window forward 1 SNP and check again, repeating until we have covered the entire genome.
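
To make the threshold concrete, r^2 here is simply the squared correlation between two SNP dosage vectors (a toy illustration with made-up genotypes coded 0/1/2):

snp_a <- c(0, 1, 2, 1, 0, 2, 1, 0)
snp_b <- c(0, 1, 2, 2, 0, 2, 1, 1)
cor(snp_a, snp_b)^2  # pairs above 0.1 are flagged for pruning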

Need to switch back to plink2

module load PLINK/2_avx2_20221024
#PLINK/1.9b_6.21-x86_64 => PLINK/2_avx2_20221024
plink2 --version
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file4 \
--extract euro_global/output/files/passed_hwe.txt \
--indep-pairwise 5 1 0.1 \
--out euro_global/output/indepSNP \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNP.log

--indep-pairwise 5 1 0.1
708 samples (85 females, 70 males, 553 ambiguous; 708 founders) loaded from euro_global/output/file4.fam.
87183 variants loaded from euro_global/output/file4.bim.
--extract: 87183 variants remaining.
87183 variants remaining after main filters.
--indep-pairwise (28 compute threads): 30726/87183 variants removed.

Remember, the SNPs are not removed from our data set. Plink created 3 files when we ran the code above. One is the log file "indepSNP.log"; the other two are "indepSNP.prune.in", the list of SNPs with squared correlation smaller than our r^2 threshold of 0.1, and "indepSNP.prune.out", the list of SNPs with squared correlation greater than our r^2 threshold of 0.1. For our heterozygosity estimates, we want the set of SNPs below the r^2 threshold of 0.1, which we consider to be randomly associated. We can use Plink to estimate heterozygosity using the "indepSNP.prune.in" file.
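
A quick sanity check of the two lists in R (a sketch):

length(readLines(here("euro_global/output/indepSNP.prune.in")))   # retained SNPs (56457)
length(readLines(here("euro_global/output/indepSNP.prune.out")))  # removed SNPs (30726)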

3.6 Heterozygosity

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file4 \
--extract euro_global/output/indepSNP.prune.in \
--het \
--out euro_global/output/R_check \
--silent;
grep 'variants' euro_global/output/R_check.log

87183 variants loaded from euro_global/output/file4.bim.
--extract: 56457 variants remaining.
56457 variants remaining after main filters.

#   find individuals with high heterozygosity                              ####
# import the data from Plink
het <- read.delim(
  here(
    "euro_global/output/R_check.het"
  ),
  header = TRUE
)
#
# check head of the file
colnames(het)
## [1] "X.FID"  "IID"    "O.HOM." "E.HOM." "OBS_CT" "F"

Estimate heterozygosity

# create a column named HET_RATE and calculate the heterozygosity rate
het$HET_RATE <- (het$"OBS_CT" - het$"O.HOM.") / het$"OBS_CT"
#
# use subset function to get values deviating from 4sd of the mean heterozygosity rate.
het_fail <-
  subset(
    het, (het$HET_RATE < mean(
      het$HET_RATE
    ) - 4 * sd(
      het$HET_RATE
    )) |
      (het$HET_RATE > mean(
        het$HET_RATE
      ) + 4 * sd(
        het$HET_RATE
      ))
  )
#
# get the list of individuals that failed our threshold of 4sd from the mean.
het_fail$HET_DST <-
  (het_fail$HET_RATE - mean(
    het$HET_RATE
  )) / sd(
    het$HET_RATE
  )

Save the files to use with Plink

#   save the data to use with Plink2                                        ####
#
write.table(
  het_fail,
  here(
    "euro_global/output/fail-het-qc.txt"
  ),
  row.names = FALSE
)

Make plot

#   make a heterozygosity plot                                              ####
#
ggplot(
  het,
  aes(
    HET_RATE
  )
) +
  geom_histogram(
    colour           = "black",
    fill             = "lightgray",
    bins             = 40
  ) +
  labs(
    x                = "Heterozygosity Rate",
    y                = "Number of Individuals"
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      )
    ),
    col              = "red",
    linewidth        = 1.5
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      ) + 4 * sd(
        HET_RATE
      )
    ),
    col              = "#BFB9B9",
    linewidth        = 1
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      ) - 4 * sd(
        HET_RATE
      )
    ),
    col              = "#BFB9B9",
    linewidth        = 1
  ) + 
  my_theme() +
  scale_y_continuous(
  )

#   save the heterozygosity plot                                            ####
ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global", "figures", "Heterozygosity_euro_global.pdf"
  ),
  width  = 5,
  height = 4,
  units  = "in"
)

The red line in the plot above indicates the mean, and the gray lines indicate 4 standard deviations from the mean. We can see that some mosquitoes have an excess of heterozygous sites; we will remove them. We can get their IDs from the file "fail-het-qc.txt" and use the bash script below to parse it for Plink.

sed 's/"// g' euro_global/output/fail-het-qc.txt | awk '{print$1, $2}'> euro_global/output/het_fail_ind.txt;
echo 'How many mosquitoes we need to remove from our data set:';
cat euro_global/output/het_fail_ind.txt | tail -n +2 | wc -l;
echo 'Which mosquitoes we have to remove:';
tail -n +2 euro_global/output/het_fail_ind.txt
## How many mosquitoes we need to remove from our data set:
## 9
## Which mosquitoes we have to remove:
## QNC 1248
## KAT 601
## KAT 605
## KAT 606
## KAT 608
## GRA 734
## TUA 783
## ITP 832
## ROS 858

Heterozygosity is outside the SD threshold in 4 KAT individuals (601, 605, 606, 608), 1 QNC (1248), 1 GRA (734), 1 TUA (783), 1 ITP (832), and 1 ROS (858). We will remove these 9 individuals.

Next, we will remove these mosquitoes from our data set using Plink:

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file4 \
--remove euro_global/output/het_fail_ind.txt \
--make-bed \
--out euro_global/output/file5 \
--silent;
grep 'variants\|samples' euro_global/output/file5.log

708 samples (85 females, 70 males, 553 ambiguous; 708 founders) loaded from euro_global/output/file4.fam.
87183 variants loaded from euro_global/output/file4.bim.
--remove: 699 samples remaining.
699 samples (85 females, 65 males, 549 ambiguous; 699 founders) remaining after main filters.

3.7 Relatedness

Check for cryptic relatedness. See the Plink2 documentation https://www.cog-genomics.org/plink/2.0/distance. You can download KING directly from https://www.kingrelatedness.com/manual.shtml or check their manuscript https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3025716/pdf/btq559.pdf. From the Plink2 documentation: “Note that KING kinship coefficients are scaled such that duplicate samples have kinship 0.5, not 1. First-degree relations (parent-child, full siblings) correspond to ~0.25, second-degree relations correspond to ~0.125, etc. It is conventional to use a cutoff of ~0.354 (the geometric mean of 0.5 and 0.25) to screen for monozygotic twins and duplicate samples, ~0.177 to add first-degree relations, etc.” There are two options: run only --make-king, or use --make-king-table. We will use the threshold of 0.354 and create a table.
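
The 0.354 cutoff quoted above is simply the geometric mean of the duplicate (0.5) and first-degree (0.25) kinship levels, which we can verify directly in R:

sqrt(0.5 * 0.25)   # 0.3535534, i.e. ~0.354 (screens duplicates/monozygotic twins)
sqrt(0.25 * 0.125) # 0.1767767, i.e. ~0.177 (adds first-degree relatives)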

# Plink2 will create a file with extension .king
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file5 \
--extract euro_global/output/indepSNP.prune.in \
--make-king-table rel-check \
--king-table-filter 0.354 \
--out euro_global/output/file5 \
--silent;
grep 'variants\|samples' euro_global/output/file5.log

Check the individuals that did not pass our filtering.

head -n 14 euro_global/output/file5.kin0
## #FID1    IID1    FID2    IID2    NSNP    HETHET  IBS0    KINSHIP
## DES  1445    DES 1292    54548   0.26023 0.00419814  0.448376
## ITB  748 ITB 747 54825   0.2599  0.0102508   0.408236
## KAN  1327    KAN 1325    54735   0.232849    0.00906184  0.390812
## REC  179 REC 175 53464   0.235691    0.00809891  0.395171
## SIC  1231    SIC 1229    52790   0.262095    0.00310665  0.455326
## SIC  1235    SIC 1233    53131   0.248951    0.00732153  0.406379
## SIC  1235    SIC 1234    52529   0.230044    0.0100135   0.365304
## SIC  1236    SIC 1234    52812   0.237806    0.00967583  0.381678
## SPM  767 SPM 765 53015   0.241554    0.0053947   0.359256
## SPS  915 SPS 914 54642   0.216372    0.0105962   0.363793
## TUA  779 TUA 776 54290   0.241131    0.0124332   0.382112
## TUA  780 TUA 777 54523   0.244722    0.00959228  0.392349

We want to remove one individual from each of these pairs.

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file5 \
--extract euro_global/output/indepSNP.prune.in \
--make-king triangle bin \
--out euro_global/output/file6 \
--silent;
grep 'variants\|samples' euro_global/output/file6.log

699 samples (85 females, 65 males, 549 ambiguous; 699 founders) loaded from euro_global/output/file5.fam.
87183 variants loaded from euro_global/output/file5.bim.
--extract: 56457 variants remaining.
56457 variants remaining after main filters.
--make-king pass 1/1: Scanning for rare variants... done.
0 variants handled by initial scan (56457 remaining).
--make-king: 56457 variants processed.

Now we can use Plink2 to remove one of the mosquitoes from each pair with high kinship. It removes 11 samples (699 - 688): in populations where one individual was related to two others, Plink2 removes the shared individual and keeps the other two, since it always tries to maximize the number of samples passing the filter.

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file5 \
--king-cutoff euro_global/output/file6 0.354 \
--make-bed \
--out euro_global/output/file7 \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/file7.log

699 samples (85 females, 65 males, 549 ambiguous; 699 founders) loaded from euro_global/output/file5.fam.
87183 variants loaded from euro_global/output/file5.bim.
--king-cutoff: Excluded sample IDs written to euro_global/output/file7.king.cutoff.out.id, and 688 remaining sample IDs written to euro_global/output/file7.king.cutoff.in.id.
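
A minimal R sketch to confirm the number of samples excluded by --king-cutoff (699 - 688 = 11), read from the .king.cutoff.out.id file reported in the log:

# plink2 writes the excluded sample IDs with a "#FID IID" header line
excluded <- read.table(
  here("euro_global/output/file7.king.cutoff.out.id"),
  header = TRUE, comment.char = ""
)
nrow(excluded)  # expect 11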

4. Create chromosomal scale

Import the .bim file with the SNPs to create a new chromosomal scale.

#   ____________________________________________________________________________
#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "euro_global/output/file7.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-583035163     0   315386 A       G      
## 2 1        AX-583033356     0   315674 C       T      
## 3 1        AX-583033370     0   330057 G       A      
## 4 1        AX-583035194     0   330265 A       G      
## 5 1        AX-583033387     0   331288 C       T      
## 6 1        AX-583035257     0   442875 T       C

Separate the tibbles into each chromosome

#   separate the SNP data per chromosome                                    
# chr1
chr1_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^1."
    )
  ) |> # here we get only Scaffold rows starting with 1
  as_tibble() # save as tibble
#
# chr2
chr2_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^2."
    )
  ) |>
  as_tibble()
#
# chr3
chr3_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^3."
    )
  ) |>
  as_tibble()

Import the file with sizes of each scaffold.

#   import the file with the scaffold sizes                                 ####
sizes <-
  read_delim(
    here(
      "/gpfs/gibbs/project/caccone/mkc54/albo/genome/scaffold_sizes.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "cd"
  )
#
# set column names
colnames(
  sizes
) <- c(
  "Scaffold", "Size"
)
#   ____________________________________________________________________________
#   create new column with the chromosome number                            ####
sizes <- 
  sizes |>
  mutate(
    Chromosome = case_when( # we use mutate to create a new column called Chromosome
      startsWith(
        Scaffold, "1"
      ) ~ "1", # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
      startsWith(
        Scaffold, "2"
      ) ~ "2",
      startsWith(
        Scaffold, "3"
      ) ~ "3"
    )
  ) |>
  arrange(
    Scaffold
  )                   # to sort the order of the scaffolds, fixing the problem we have with scaffold 1.86
# check it
head(sizes)
## # A tibble: 6 × 3
##   Scaffold     Size Chromosome
##   <chr>       <dbl> <chr>     
## 1 1.1        351198 1         
## 2 1.10     11939576 1         
## 3 1.100     3389100 1         
## 4 1.101      470438 1         
## 5 1.102     2525157 1         
## 6 1.103      150026 1

Create new scale. Get the scaffolds for each chromosome.

#   separate the scaffold sizes tibble per chromosome                       ####
# chr1
chr1_scaffolds <- 
  sizes |>
  filter(
    str_detect(
      Scaffold, "^1" # we use library stringr to get scaffolds starting with 1 (chromosome 1)
    )
  ) |> 
  as_tibble()
#
# chr2
chr2_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^2" # we use library stringr to get scaffolds starting with 2 (chromosome 2)
    )
  ) |> 
  as_tibble()
#
# # chr3
chr3_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^3" # we use library stringr to get scaffolds starting with 3 (chromosome 3)
    )
  ) |>
  as_tibble()

Create a scale for each chromosome.

#   create a new scale for each chromosome                                  ####
# chr1
chr1_scaffolds$overall_size_before_bp <-
  0                                                                        # we create a new column with zeros
for (i in 2:nrow(
  chr1_scaffolds
)
) {                                                                        # loop to start on second line
  chr1_scaffolds$overall_size_before_bp[i] <-                              # set position on the scale
    chr1_scaffolds$overall_size_before_bp[i - 1] + chr1_scaffolds$Size[i - # add the scaffold size and the location to get position on new scale
      1]
}
#
# chr2
chr2_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr2_scaffolds
)
) {
  chr2_scaffolds$overall_size_before_bp[i] <-
    chr2_scaffolds$overall_size_before_bp[i - 1] + chr2_scaffolds$Size[i -
      1]
}
#
# chr3
chr3_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr3_scaffolds
)
) {
  chr3_scaffolds$overall_size_before_bp[i] <-
    chr3_scaffolds$overall_size_before_bp[i - 1] + chr3_scaffolds$Size[i -
      1]
}
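
As an aside, the three loops above can be collapsed into one dplyr pipeline; a sketch, assuming the sizes tibble built above (the offset of each scaffold is the cumulative size of the scaffolds preceding it within its chromosome):

sizes_offsets <- sizes |>
  group_by(Chromosome) |>
  arrange(Scaffold, .by_group = TRUE) |>
  mutate(overall_size_before_bp = lag(cumsum(Size), default = 0)) |>
  ungroup()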

Merge the data frames scaffolds and SNPs.

#   merge the data sets using the tidyverse function left_join              ####
# chr1
chr1_scale <-
  chr1_snps |>          # create data frame for each chromosome, get chr1_snps
  left_join(            # use lef_join function to merge it with chr1_scaffolds
    chr1_scaffolds,
    by = "Scaffold"
  ) |>                  # set column to use for merging (Scaffold in this case)
  na.omit() |>          # remove NAs, we don't have SNPs in every scaffold
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +                 # make new columns numeric
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr2
chr2_scale <-
  chr2_snps |>
  left_join(
    chr2_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr3
chr3_scale <-
  chr3_snps |>
  left_join(
    chr3_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )

Merge all chromosome scales.

#   merge the data sets, and select only the columns we need                ####
chroms <- rbind(
  chr1_scale, chr2_scale, chr3_scale
) |>
  dplyr::select(
    Chromosome, SNP, Cm, midPos_fullseq, Allele1, Allele2
  )
# check it
head(chroms)
## # A tibble: 0 × 6
## # ℹ 6 variables: Chromosome <chr>, SNP <chr>, Cm <int>, midPos_fullseq <dbl>,
## #   Allele1 <chr>, Allele2 <chr>

Note: an empty tibble here means the section was re-run after file7.bim had already been renamed to the chromosomal scale (Scaffold is then "1", "2", "3", which the "^1." style filters above do not match). Run this section on the original scaffold-scale file7.bim (kept below as file7_backup.bim) so that chroms is populated before writing the new .bim.

Save the new .bim file

#   ____________________________________________________________________________
#   save the new bim file with a new name, I added "B"                      ####
write.table(
  chroms,
  file      = here(
    "euro_global/output/file7C.bim"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Rename the .bim files

# rename the original .bim file, e.g., append _backup.bim
mv euro_global/output/file7.bim euro_global/output/file7_backup.bim;
# then rename the new bim we created to the original name (do it only once, otherwise it will mess up the files)
mv euro_global/output/file7C.bim euro_global/output/file7.bim
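
Since this rename is easy to get wrong, a minimal R sketch to verify that the new .bim still has the same SNPs in the same order as the backup:

old_bim <- read.table(here("euro_global/output/file7_backup.bim"))
new_bim <- read.table(here("euro_global/output/file7.bim"))
nrow(old_bim) == nrow(new_bim)  # must be TRUE
all(old_bim$V2 == new_bim$V2)   # SNP IDs (column 2) must line up row by row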

Create a new bed file with Plink2 to check that the new .bim works, for example, that the variants are in the right order; Plink2 will warn us if something is wrong.

plink2 \
--bfile euro_global/output/file7 \
--make-bed \
--out euro_global/output/test01;
# then we remove the files 
rm euro_global/output/test01.*

PLINK v2.00a3.7LM AVX2 Intel (24 Oct 2022)     www.cog-genomics.org/plink/2.0/
(C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to euro_global/output/test01.log.
Options in effect:
  --bfile euro_global/output/file7
  --make-bed
  --out euro_global/output/test01
Start time: Mon Dec 4 10:44:22 2023
257256 MiB RAM detected; reserving 128628 MiB for main workspace.
Using up to 32 threads (change this with --threads).
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7.fam.
87183 variants loaded from euro_global/output/file7.bim.
Note: No phenotype data present.
Writing euro_global/output/test01.fam ... done.
Writing euro_global/output/test01.bim ... done.
Writing euro_global/output/test01.bed ... done.
End time: Mon Dec 4 10:44:23 2023

4.1 Estimate LD blocks using chromosomal scale

mkdir -p euro_global/output/files/ld/blocks_chr
module load PLINK/1.9b_6.21-x86_64

We can use Plink 1.9 to estimate LD blocks for the populations with more than 12 individuals. We will use the entire genome for this part instead of only the larger scaffolds. We set the maximum LD block size to 200kb (--blocks-max-kb 200), since we found that the average half-distance of maximum r^2 is small, from 1 to 5kb.

for fam in $(awk '{print $1}' euro_global/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile euro_global/output/file7 \
--keep-fam /dev/stdin \
--maf 0.1 \
--blocks no-pheno-req \
--blocks-max-kb 200 \
--out euro_global/output/files/ld/blocks_chr/$fam \
--geno 0.1 \
--silent
done;
#
rm euro_global/output/files/ld/blocks_chr/*.nosex

Now we can get some data from our .log files

echo "Population,n,nVariants,geno,maf,passQC" > euro_global/output/files/ld/blocks_chr/table_ld_stats.csv
for file in euro_global/output/files/ld/blocks_chr/*.log
do
  variants=$(grep -oE '([0-9]+) variants loaded from \.bim file' $file | grep -oE '[0-9]+')
  geno=$(grep -oE '([0-9]+) variants removed due to missing genotype data \(--geno\)' $file | grep -oE '[0-9]+')
  maf=$(grep -oE '([0-9]+) variants removed due to minor allele threshold\(s\)' $file | grep -oE '[0-9]+')
  pass=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | head -1)
  n=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | tail -1)
  filename=$(basename $file .log)
  echo "$filename,$n,$variants,$geno,$maf,$pass" >> euro_global/output/files/ld/blocks_chr/table_ld_stats.csv
done;
head -n 5 euro_global/output/files/ld/blocks_chr/table_ld_stats.csv
## Population,n,nVariants,geno,maf,passQC
## ALU,12,87183,5005,23240,58938
## ALV,12,87183,4439,21462,61282
## BAR,12,87183,5935,24298,56950
## BEN,12,87183,6361,31062,49760

Check it

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "euro_global/output/files/ld/blocks_chr/table_ld_stats.csv"
    ),
  header = TRUE,
  sep = ","
)

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 1: Summary of quality control for Euro_global dataset.")

ft

Table 1: Summary of quality control for Euro_global dataset.

Population    n   nVariants    geno     maf   passQC
ALU          12      87,183   5,005  23,240   58,938
ALV          12      87,183   4,439  21,462   61,282
BAR          12      87,183   5,935  24,298   56,950
BEN          12      87,183   6,361  31,062   49,760
BER          12      87,183   5,104  18,622   63,457
BRE          13      87,183  10,298  22,602   54,283
CAM          12      87,183   7,616  26,142   53,425
CES          14      87,183  15,824  24,485   46,874
CHA          12      87,183   4,781  29,232   53,170
CRO          12      87,183   4,642  21,290   61,251
DES          16      87,183  17,861  19,029   50,293
FRS          12      87,183   3,649  18,716   64,818
GES          12      87,183   4,226  25,586   57,371
GRA          11      87,183   4,402  20,482   62,299
GRV          12      87,183   5,810  26,958   54,415
HAI          12      87,183   4,275  19,967   62,941
HUN          12      87,183   3,875  17,296   66,012
ITR          12      87,183   3,606  19,021   64,556
KAG          12      87,183   5,168  22,008   60,007
KAN          11      87,183   4,630  28,705   53,848
KER          12      87,183   3,940  22,189   61,054
KRA          12      87,183   6,336  21,330   59,517
MAL          12      87,183   3,844  18,044   65,295
MAT          12      87,183   5,074  29,941   52,168
OKI          12      87,183   5,933  25,350   55,900
POP          12      87,183   3,925  16,703   66,555
QNC          11      87,183   5,595  36,809   44,779
RAR          12      87,183   4,111  26,637   56,435
REC          11      87,183   8,469  23,358   55,356
ROS          11      87,183   5,105  23,286   58,792
SEV          12      87,183   4,472  27,759   54,952
SIC           9      87,183  19,392  14,832   52,959
SLO          12      87,183   7,077  16,189   63,917
SOC          12      87,183   4,258  25,187   57,738
SSK          12      87,183   4,939  29,439   52,805
STS          12      87,183   4,558  18,918   63,707
TIK          12      87,183   4,234  27,668   55,281
TRE          12      87,183   3,667  15,455   68,061
TUA           9      87,183  12,585  17,780   56,818
TUH          12      87,183   4,677  19,546   62,960
UTS          12      87,183   4,226  21,764   61,193

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("euro_global/output/summary_blocks_chr.docx"))

Count how many blocks we found in each population

cd euro_global/output/files/ld/blocks_chr
wc -l *.blocks | \
  awk '{population = gensub(/\.blocks_chr/, "", "g", $2); print population "\t" $1}' | \
  sed 's#euro_global/output/files/ld/blocks_chr/##' | \
  sed 's/.blocks//' | \
  sed '$d' > populations_block_counts.csv;
head -n 79 populations_block_counts.csv
## ALU  83
## ALV  99
## BAR  114
## BEN  8
## BER  105
## BRE  307
## CAM  8
## CES  421
## CHA  7
## CRO  107
## DES  413
## FRS  68
## GES  186
## GRA  31
## GRV  170
## HAI  62
## HUN  48
## ITR  88
## KAG  127
## KAN  240
## KER  135
## KRA  98
## MAL  79
## MAT  13
## OKI  142
## POP  81
## QNC  24
## RAR  161
## REC  85
## ROS  74
## SEV  217
## SIC  0
## SLO  43
## SOC  139
## SSK  8
## STS  121
## TIK  203
## TRE  75
## TUA  3
## TUH  125
## UTS  143

Add the number of blocks to the table we made

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "euro_global/output/files/ld/blocks_chr/table_ld_stats.csv"
    ),
  header = TRUE,
  sep = ","
)

# Load the population counts data from the CSV file
pop_counts <-
  read.delim(
    here(
      "euro_global/output/files/ld/blocks_chr/populations_block_counts.csv"
    ),
    header = F,
    sep = "\t"
  ) |>
  rename(
    Population       = 1,
    nBlocks          = 2
  )

# Merge the population counts with the table data
ld_blocks <- merge(ld_blocks, pop_counts, by = "Population")

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 2: Number of linkage blocks detected with Plink for populations with at least 10 individuals.")

ft

Table 2: Number of linkage blocks detected with Plink for populations with at least 10 individuals.

Population    n   nVariants    geno     maf   passQC   nBlocks
ALU          12      87,183   5,005  23,240   58,938        83
ALV          12      87,183   4,439  21,462   61,282        99
BAR          12      87,183   5,935  24,298   56,950       114
BEN          12      87,183   6,361  31,062   49,760         8
BER          12      87,183   5,104  18,622   63,457       105
BRE          13      87,183  10,298  22,602   54,283       307
CAM          12      87,183   7,616  26,142   53,425         8
CES          14      87,183  15,824  24,485   46,874       421
CHA          12      87,183   4,781  29,232   53,170         7
CRO          12      87,183   4,642  21,290   61,251       107
DES          16      87,183  17,861  19,029   50,293       413
FRS          12      87,183   3,649  18,716   64,818        68
GES          12      87,183   4,226  25,586   57,371       186
GRA          11      87,183   4,402  20,482   62,299        31
GRV          12      87,183   5,810  26,958   54,415       170
HAI          12      87,183   4,275  19,967   62,941        62
HUN          12      87,183   3,875  17,296   66,012        48
ITR          12      87,183   3,606  19,021   64,556        88
KAG          12      87,183   5,168  22,008   60,007       127
KAN          11      87,183   4,630  28,705   53,848       240
KER          12      87,183   3,940  22,189   61,054       135
KRA          12      87,183   6,336  21,330   59,517        98
MAL          12      87,183   3,844  18,044   65,295        79
MAT          12      87,183   5,074  29,941   52,168        13
OKI          12      87,183   5,933  25,350   55,900       142
POP          12      87,183   3,925  16,703   66,555        81
QNC          11      87,183   5,595  36,809   44,779        24
RAR          12      87,183   4,111  26,637   56,435       161
REC          11      87,183   8,469  23,358   55,356        85
ROS          11      87,183   5,105  23,286   58,792        74
SEV          12      87,183   4,472  27,759   54,952       217
SIC           9      87,183  19,392  14,832   52,959         0
SLO          12      87,183   7,077  16,189   63,917        43
SOC          12      87,183   4,258  25,187   57,738       139
SSK          12      87,183   4,939  29,439   52,805         8
STS          12      87,183   4,558  18,918   63,707       121
TIK          12      87,183   4,234  27,668   55,281       203
TRE          12      87,183   3,667  15,455   68,061        75
TUA           9      87,183  12,585  17,780   56,818         3
TUH          12      87,183   4,677  19,546   62,960       125
UTS          12      87,183   4,226  21,764   61,193       143

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("scripts/RMarkdowns/euro_global/output/summary_ld_blocks_chr.docx"))

Get the size of each block from the .blocks.det files

get_kb_column <- function(dir_path) {
  # obtain the list of files with extension .blocks.det
  file_names <- list.files(path = dir_path, pattern = "\\.blocks\\.det$", full.names = TRUE)
  
  # create an empty list to hold the data frames
  block_list <- list()
  
  # loop through the files and read the data into the list
  for (file in file_names) {
    df <- read.table(file, header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
    # select only the KB column and add it to the block_list with the file name
    block_list[[file]] <- df %>% dplyr::select(KB) %>% add_column(file = file, .before = 1)
  }
  
  # combine the data frames in the block_list into a single data frame
  blocks <- bind_rows(block_list)
  
  # clean up the file name column
  blocks$file <- str_remove(blocks$file, "^.*\\/ld\\/blocks\\/")
  
  return(blocks)
}

# example usage: replace dir_path with your directory path
dir_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/files/ld/blocks_chr")

blocks<- 
  get_kb_column(dir_path) |>
  mutate(file = str_remove(file, "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/files/ld/blocks_chr/")) |>
  mutate(file = str_remove(file, ".blocks.det")) |>

  as_tibble() |>
  rename(
    Population = 1
  )

Create density plot of the size of the LD blocks Plink found

# to check how many colors we need
# n_distinct(blocks$Population) #24

source(
  here("scripts", "RMarkdowns",
    "my_theme2.R"
  )
)

n <- 50
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
pie(rep(1,n), col=sample(col_vector, n))

col_vector
##  [1] "#7FC97F" "#BEAED4" "#FDC086" "#FFFF99" "#386CB0" "#F0027F" "#BF5B17"
##  [8] "#666666" "#1B9E77" "#D95F02" "#7570B3" "#E7298A" "#66A61E" "#E6AB02"
## [15] "#A6761D" "#666666" "#A6CEE3" "#1F78B4" "#B2DF8A" "#33A02C" "#FB9A99"
## [22] "#E31A1C" "#FDBF6F" "#FF7F00" "#CAB2D6" "#6A3D9A" "#FFFF99" "#B15928"
## [29] "#FBB4AE" "#B3CDE3" "#CCEBC5" "#DECBE4" "#FED9A6" "#FFFFCC" "#E5D8BD"
## [36] "#FDDAEC" "#F2F2F2" "#B3E2CD" "#FDCDAC" "#CBD5E8" "#F4CAE4" "#E6F5C9"
## [43] "#FFF2AE" "#F1E2CC" "#CCCCCC" "#E41A1C" "#377EB8" "#4DAF4A" "#984EA3"
## [50] "#FF7F00" "#FFFF33" "#A65628" "#F781BF" "#999999" "#66C2A5" "#FC8D62"
## [57] "#8DA0CB" "#E78AC3" "#A6D854" "#FFD92F" "#E5C494" "#B3B3B3" "#8DD3C7"
## [64] "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69" "#FCCDE5"
## [71] "#D9D9D9" "#BC80BD" "#CCEBC5" "#FFED6F"
# make plot using the sample y scale for all populations
ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = col_vector) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3) + my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here("scripts", "RMarkdowns",
    "output",
    "euro_global",
    "figures",
    "block_density_y_scale_fixed_chr_euro_global.pdf"
  ),
  width = 10,
  height = 10,
  units = "in"
)

Make the plot with a free y-axis scale

ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = col_vector) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3, scales = "free_y") +
  my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here("scripts", "RMarkdowns",
    "output",
    "euro_global",
    "figures",
    "block_density_free_scale_chr_euro_global.pdf"
  ),
  width = 20,
  height = 18,
  units = "in"
)

6. Plot SNP density

After quality control we have approximately 87k SNPs (87,183).

# load the function that we saved earlier
source(
  here("scripts", "RMarkdowns",
    "analyses", "import_bim.R"
  ),
  local = knitr::knit_global()
)
# import the file
snp_den_qc <- import_bim(
  here(
    "euro_global/output/file7.bim"
  )
)

Make plot of the SNP density

#   ____________________________________________________________________________
#   plot SNP density after QC                                               ####
snp_den_qc |>
  rename(
    Chromosome = 1
  ) |>
  mutate(
    Position           = as.numeric(
      Position
    )
  ) |>
  ggplot(
    aes(
      x                = Position
    )
  ) +
  geom_histogram(
    aes(
      y                = after_stat(
        count
      )
    ),
    binwidth           = 1e6
  ) +
  facet_wrap(
    vars(
      Chromosome
    ),
    scales             = "free_x"
  ) +
  labs(
    title              = "SNP Density after QC",
    x                  = expression(
      "Position in the genome (Mb)"
    ),
    y                  = expression(
      "Number of SNPs"
    )
  ) +
  scale_x_continuous(
    labels             = function(x) {
      format(
        x / 1e6,
        big.mark       = ",", 
        scientific     = FALSE
      )
    }
  ) +
  geom_density(
  aes(
    y = 1e6 * after_stat(count)
  ),
  color = "red",
  linewidth = .75,
  alpha = .4,
  fill = "pink"
  ) +
    theme(
    panel.grid.major   = element_line(
      linetype         = "dashed",
      linewidth        = 0.2
    ),
    panel.grid.minor   = element_line(
      linetype         = "dashed",
      linewidth        = 0.2
    ),
    panel.spacing      = unit(0.5, "lines"),
    strip.text         = element_text(
      face             = "bold", hjust = .5
    ),
    strip.background.x = element_rect(
      color            = "gray"
    )
  )

ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global","figures", "snp_density_after_qc_euro_global.pdf"
  ),
  width  = 10,
  height = 6,
  units  = "in"
)

SNPs per chromosome

# we can use dplyr "count" to get the number of SNPs for each chromosome
# lets get the data we need
snps_per_chrm <- 
  snp_den_qc |>
  count(
    Scaffold) |>
  rename(
    Chromosome = 1,
    "SNPs (N) " = 2
  )

# Create the flextable
ft <- flextable::flextable(snps_per_chrm)

# Apply zebra theme
ft <- flextable::theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "SNPs per chromosome after quality control")
ft

SNPs per chromosome after quality control

Chromosome   SNPs (N)
1              19,362
2              36,611
3              31,210

We can get the mean number of SNPs per 1Mb window for each chromosome across the genome

# we first use dplyr cut_width to get the number of SNPs per 1Mb window
albo_den <- 
  snp_den_qc |>
  dplyr::select(
    Scaffold, Position
  ) |>
  group_by(
    Scaffold,
    windows               = cut_width(
      Position,
      width               = 1e6,
      boundary            = 0
    )
  ) |>
  summarise(
    n                     = n(),
    .groups               = "keep"
  ) |>
  group_by(
    Scaffold
  ) |>
  summarise(
    mean                  = mean(n),
    n                     = n(),
    .groups               = "keep"
  ) |>
  rename(
    Chromosome            = 1,
    "SNPs per 1Mb window" = 2,
    "Number of windows"   = 3
  )
#
# check the results
snp_table <-
  flextable(
    albo_den
  )
snp_table <- colformat_double(
  x        = snp_table,
  big.mark = ",",
  digits   = 2,
  na_str   = "N/A"
)
snp_table

Chromosome   SNPs per 1Mb window   Number of windows
1                          52.76                 367
2                          63.34                 578
3                          63.82                 489

Merge objects

# we can merge the two data sets we created above into one table
after_qc <-
  snps_per_chrm |>
  left_join(
    albo_den,
    by = "Chromosome"
  )
snp_table2 <- flextable(
  after_qc)
snp_table2 <- colformat_double(
  x        = snp_table2,
  big.mark = ",",
  digits   = 2, 
  na_str   = "N/A"
  )
snp_table2

Chromosome   SNPs (N)   SNPs per 1Mb window   Number of windows
1              19,362                 52.76                 367
2              36,611                 63.34                 578
3              31,210                 63.82                 489

9. LD pruning using the chromosomal scale

# we set a window of 5 variants and slide it 1 variant at a time, removing the variant with the lower MAF from any pair with r^2 > 0.1
# the mean distance is 203kb across the tested populations; one could also try --indep-pairwise 200kb 1 0.1
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file7 \
--indep-pairwise 5 1 0.1 \
--out euro_global/output/indepSNP_chr \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNP_chr.log

--indep-pairwise 5 1 0.1
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7.fam.
87183 variants loaded from euro_global/output/file7.bim.
--indep-pairwise (3 compute threads): 30799/87183 variants removed.

Let's do the scaffold scale again. First we need to import the same SNP list.

Import the .bim file with the SNPs.

#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "euro_global/output/file7_backup.bim" #output/populations/file4.bim
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.1      AX-583035163     0   315386 A       G      
## 2 1.1      AX-583033356     0   315674 C       T      
## 3 1.1      AX-583033370     0   330057 G       A      
## 4 1.1      AX-583035194     0   330265 A       G      
## 5 1.1      AX-583033387     0   331288 C       T      
## 6 1.10     AX-583035257     0    91677 T       C
write.table(
  snps,
  file      = here(
    "euro_global/output/snps_all_scaffolds_4ld.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

nrow(snps)
## [1] 87183

Recreate the relatedness-filtered data set on the scaffold scale (the same --king-cutoff as in section 3.7), saving it as file11a:

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file5 \
--king-cutoff euro_global/output/file6 0.354 \
--make-bed \
--out euro_global/output/file11a \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/file11a.log

699 samples (85 females, 65 males, 549 ambiguous; 699 founders) loaded from euro_global/output/file5.fam.
87183 variants loaded from euro_global/output/file5.bim.
--king-cutoff: Excluded sample IDs written to euro_global/output/file11a.king.cutoff.out.id, and 688 remaining sample IDs written to euro_global/output/file11a.king.cutoff.in.id.

# we set a window of 5 variants and slide it 1 variant at a time, removing the variant with the lower MAF from any pair with r^2 > 0.1
# the mean distance is 203kb across the tested populations; we used --indep-pairwise 5 1 0.1 before, so we keep the same values from the mean half-distance of max r2
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file11a \
--indep-pairwise 5 1 0.1 \
--out euro_global/output/indepSNP_scaffolds \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNP_scaffolds.log

--indep-pairwise 5 1 0.1
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file11a.fam.
87183 variants loaded from euro_global/output/file11a.bim.
--indep-pairwise (28 compute threads): 30787/87183 variants removed.

Now we can compare the two sets of SNPs using scaffolds or chromosomal scale

Create Venn diagram of SNPs removed due to LD

# Read in the two files as vectors
prunout_chr <-
  read_delim(
    here(
      "euro_global/output/indepSNP_chr.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

prunout_scaffolds <-
  read_delim(
    here(
      "euro_global/output/indepSNP_scaffolds.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

# Convert the list to vector
prunout_scaffolds <- unlist(prunout_scaffolds)
prunout_chr <- unlist(prunout_chr)


# Calculate shared values
prunout <- intersect(prunout_chr, prunout_scaffolds)

# Create Venn diagram
venn_data1 <- list(
  "Chromosomal" = prunout_chr,
  "Scaffolds" = prunout_scaffolds
)

# create plot
venn_plot1 <- ggvenn(venn_data1, fill_color = c("steelblue", "darkorange"), show_percentage = TRUE)

# Add a title
venn_plot1 <- venn_plot1 +
  ggtitle("Comparison of genomic scales for linked SNPs") +
  theme(plot.title = element_text(hjust = .5))

# Display the Venn diagram
print(venn_plot1)

# Save Venn diagram to PDF
output_path <- here("scripts", "RMarkdowns", "output", "euro_global", "figures", "SNPs_linked_comparison_euro_global.pdf")
ggsave(output_path, venn_plot1, height = 5, width = 5, dpi = 300)
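
A quick sketch of the numbers behind the diagram, using the counts reported in the two pruning logs:

length(prunout_chr)        # expect 30799 pruned out on the chromosomal scale
length(prunout_scaffolds)  # expect 30787 pruned out on the scaffold scale
length(prunout)            # SNPs flagged on both scales (the Venn overlap)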

11. Pruning r2 for LD-pruned datasets

We already have a data set pruned at r2 0.1 on the chromosomal scale (made in step 9). If needed, we can create it again using the code below:

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file7 \
--indep-pairwise 5 1 0.1 \
--out euro_global/output/indepSNP_chr \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNP_chr.log

--indep-pairwise 5 1 0.1
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7.fam.
87183 variants loaded from euro_global/output/file7.bim.
--indep-pairwise (3 compute threads): 30799/87183 variants removed.

The indepSNP_chr.prune.in file produced here lists the SNPs below our r2 threshold of 0.1, i.e., the ones we want to keep.

Now we need to make the pruned data set for r2 0.01.

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file7 \
--indep-pairwise 5 1 0.01 \
--out euro_global/output/indepSNP_chr_r01 \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNP_chr_r01.log

--indep-pairwise 5 1 0.01
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7.fam.
87183 variants loaded from euro_global/output/file7.bim.
--indep-pairwise (3 compute threads): 67865/87183 variants removed.

We can take the SNP set from the LD pruning we just did and extract it from file7 to get SNP set 1:

plink2 \
--keep-allele-order \
--bfile euro_global/output/file7 \
--make-bed \
--export vcf \
--out euro_global/output/snps_sets/r2_0.01 \
--extract euro_global/output/indepSNP_chr_r01.prune.in \
--silent
grep "variants\|samples" euro_global/output/snps_sets/r2_0.01.log

688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7.fam.
87183 variants loaded from euro_global/output/file7.bim.
--extract: 19318 variants remaining.
19318 variants remaining after main filters.

Repeat for 0.1 pruning (SNP Set 2)

plink2 \
--keep-allele-order \
--bfile euro_global/output/file7 \
--make-bed \
--export vcf \
--out euro_global/output/snps_sets/r2_0.1 \
--extract euro_global/output/indepSNP_chr.prune.in \
--silent
grep "variants\|samples" euro_global/output/snps_sets/r2_0.1.log

688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7.fam.
87183 variants loaded from euro_global/output/file7.bim.
--extract: 56384 variants remaining.
56384 variants remaining after main filters.

These SNP sets can now be used for subsequent population genetics analyses.
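
A minimal R sketch to confirm the sizes of the two exported sets from their .bim files:

set1 <- read.table(here("euro_global/output/snps_sets/r2_0.01.bim"))
set2 <- read.table(here("euro_global/output/snps_sets/r2_0.1.bim"))
nrow(set1)  # expect 19318 (SNP set 1, r2 0.01)
nrow(set2)  # expect 56384 (SNP set 2, r2 0.1)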

12. Run PCA with LD-pruned dataset (for SNP Set 2)

plink2 \
--allow-extra-chr \
--bfile euro_global/output/snps_sets/r2_0.1 \
--pca allele-wts \
--freq \
--out euro_global/output/pca_pops_1 \
--silent;
grep 'samples\|variants' euro_global/output/pca_pops_1.log

688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/snps_sets/r2_0.1.fam.
56384 variants loaded from euro_global/output/snps_sets/r2_0.1.bim.

Check the files

head -n 2 euro_global/output/pca_pops_1.eigenvec
## #FID IID PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## OKI  1001    0.00349461  0.000118613 0.0208636   -0.00870585 0.00203497  -0.02869    -0.00142337 0.0155941   0.123031    0.0553878
head -n 10 euro_global/output/pca_pops_1.eigenval
## 39.5287
## 20.1036
## 13.397
## 10.4736
## 6.95265
## 6.3098
## 6.18528
## 5.69513
## 5.64839
## 5.33564
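
To express each eigenvalue as a share of the variance captured by the 10 extracted PCs, a small sketch (note this is relative to the reported PCs only, not to the full genotype matrix):

eigenval <- scan(here("euro_global/output/pca_pops_1.eigenval"))
round(100 * eigenval / sum(eigenval), 1)  # percent of variance among the 10 PCs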

Import PCA data

# import the data from Plink
pca <- read.delim(
  here(
    "euro_global/output/pca_pops_1.eigenvec"
  ),
  head = TRUE
)
 
# check head of the file
head(pca)
##   X.FID  IID         PC1          PC2       PC3         PC4        PC5
## 1   OKI 1001 0.003494610  0.000118613 0.0208636 -0.00870585 0.00203497
## 2   OKI 1002 0.000384979 -0.000916926 0.0198715 -0.00580988 0.00924135
## 3   OKI 1003 0.001768030 -0.001275240 0.0250805 -0.00376816 0.00669142
## 4   OKI 1004 0.004099850  0.004415930 0.0185805 -0.00992829 0.01588560
## 5   OKI 1005 0.004148830  0.002128780 0.0199122 -0.01100970 0.01326390
## 6   OKI 1006 0.001563690  0.000750957 0.0224729 -0.00564903 0.00829270
##          PC6         PC7       PC8      PC9      PC10
## 1 -0.0286900 -0.00142337 0.0155941 0.123031 0.0553878
## 2 -0.0174144 -0.00895626 0.0118358 0.114927 0.0541798
## 3 -0.0235837 -0.00728948 0.0289258 0.188506 0.0892375
## 4 -0.0297502  0.00484661 0.0229789 0.152901 0.0754698
## 5 -0.0311584  0.00671398 0.0231348 0.166893 0.0794183
## 6 -0.0267463 -0.00465223 0.0309340 0.198834 0.1030400

Import sample attributes

# import sample attributes
samples2 <- read.delim(
  here("scripts", "RMarkdowns",
    "Population_meta_data.txt"
  ),
  head = TRUE
)
#
# check head of the file
head(samples2)
##   geo    range continent         region    country pop     pop_city
## 1   4 Invasive    Africa Central Africa      Gabon GAB  Franceville
## 2   5 Invasive    Africa    East Africa Madagascar ANT Antananarivo
## 3   5 Invasive    Africa    East Africa Madagascar MAD    Morondava
## 4   5 Invasive    Africa    East Africa Madagascar DGV  Diego ville
## 5   5 Invasive    Africa    East Africa Madagascar VOH     Vohimasy
## 6   6 Invasive    Africa   Indian Ocean  Mauritius DAU      Dauguet

Check number of samples per population

pops_pca <- 
  pca |>
  group_by(X.FID) |>
  summarize(count_distinct = n_distinct(IID))

# check it
head(pops_pca)
## # A tibble: 6 × 2
##   X.FID count_distinct
##   <chr>          <int>
## 1 ALD               10
## 2 ALU               12
## 3 ALV               12
## 4 ARM               10
## 5 BAR               12
## 6 BEN               12

Merge the data

df4 <-
  merge(
    pca,
    samples2,
    by.x = 1,
    by.y = 6,
    all.x = T,
    all.y = F
  ) |>
  na.omit()
head(df4)
##   X.FID IID       PC1        PC2       PC3        PC4       PC5        PC6
## 1   ALD 801 0.0159056 0.00857769 0.0621374 0.00954265 -0.133485 0.02282570
## 2   ALD 802 0.0141292 0.00749907 0.0672776 0.01092060 -0.153636 0.01851460
## 3   ALD 803 0.0145545 0.00648255 0.0789674 0.01383960 -0.119952 0.03130150
## 4   ALD 804 0.0143278 0.00543499 0.0712662 0.00800879 -0.136483 0.00989339
## 5   ALD 805 0.0145400 0.01064620 0.0567210 0.00721298 -0.123181 0.00497971
## 6   ALD 806 0.0120557 0.00572123 0.0706519 0.00906846 -0.141123 0.02840840
##         PC7        PC8        PC9      PC10 geo    range continent
## 1 0.0284547 0.02853680 -0.0258755 0.0168006  13 Invasive    Europe
## 2 0.0383160 0.03317460 -0.0340364 0.0166647  13 Invasive    Europe
## 3 0.0295233 0.04432740 -0.0409305 0.0239785  13 Invasive    Europe
## 4 0.0441333 0.01870210 -0.0312714 0.0125413  13 Invasive    Europe
## 5 0.0388875 0.00906818 -0.0175808 0.0115633  13 Invasive    Europe
## 6 0.0453232 0.02585260 -0.0192321 0.0184307  13 Invasive    Europe
##            region country pop_city
## 1 Southern Europe Albania   Durres
## 2 Southern Europe Albania   Durres
## 3 Southern Europe Albania   Durres
## 4 Southern Europe Albania   Durres
## 5 Southern Europe Albania   Durres
## 6 Southern Europe Albania   Durres

Get some data for the PCA plot

# How many populations
length(unique(df4$X.FID))
## [1] 73
# How many countries
length(unique(df4$country))
## [1] 32
# How many regions
length(unique(df4$region))
## [1] 8
# How many continents
length(unique(df4$continent))
## [1] 3

Check the countries

unique(df4$country)
##  [1] "Albania"   "Ukraine"   "Armenia"   "Spain"     "India"     "USA"      
##  [7] "Italy"     "Bulgaria"  "Cambodia"  "Thailand"  "Croatia"   "France"   
## [13] "Bhutan"    "Georgia"   "Greece"    "Brazil"    "China"     "Vietnam"  
## [19] "Indonesia" "Sri Lanka" "Japan"     "Nepal"     "Malaysia"  "Russia"   
## [25] "Maldives"  "Malta"     "Portugal"  "Romania"   "Serbia"    "Slovenia" 
## [31] "Taiwan"    "Turkey"

Check the regions

unique(df4$region)
## [1] "Southern Europe" "Eastern Europe"  "South Asia"      "North America"  
## [5] "Southeast Asia"  "Western Europe"  "South America"   "East Asia"

Make plot1

# source the plotting function
source(here("scripts", "RMarkdowns", "my_theme2.R"))

# Shapes
N = 100
M = 1000
good.shapes = c(1:25, 33:127)

# Colors
palette1 <- brewer.pal(12, "Paired")
palette2 <- brewer.pal(11, "Set3")
palette23 <- c(palette1, palette2)
colors2 <- 
  c("#52ef99", "#146c45", "#75d5e1", "#FB8072", "#2c4a5e", "#6a8fe0", "#8c61cd", "#f365e7", "#871550", "#a113b2", "#BF5B17", "#1F78B4", "#cf749b", "#FF7F00","#2524f9", "#799d10",  "#984EA3", "#754819", "#fda547", "#a41415", "#fd5917", "#fd4e8b", "#ead624", "#6A3D9A", "#21a708", "#332288", "#51f310", "#9d8d88", "#66C2A5", "#E41A1C", "#E7297A", "magenta")


# Compute the count for each country
country_count <- df4 |>
  group_by(country) |>
  summarize(count = n())

# Merge the count back to the main data
df4 <- df4 |>
  left_join(country_count, by = "country")

# Create a custom label for the legend
df4$country_label <-
  paste(df4$country, " (", df4$count, ")", sep = "")

# Define the color and shape manually
#palette23 <- c(palette1, palette2)
colors <- setNames(colors2, unique(df4$country_label))
shapes <-
  setNames(good.shapes[c(1:25, 28:31, 55:57)], unique(df4$country_label))


# Compute the center of ellipses for each continent
ellipse_centers <- df4 |>
  group_by(region) |>
  summarise(PC1_center = mean(PC1), PC2_center = mean(PC2))

# # Calculate the number of samples per region
continent_count <- df4 |>
  group_by(region) |>
  summarise(count = n())
continent_labels <-
  setNames(
    paste(continent_count$region, " (", continent_count$count, ")", sep = ""),
    continent_count$region
  )

# Define the colors you want for each continent
#continent_colors <-
  #c("yellow", "pink", "#6a8fe0", "gold3", "#FF6F90", "lightpink3", "lightblue","blue") # Adjust these colors to your preference

continent_colors <-
  c("pink", "lightblue", "lightgreen", "lightgreen", "pink", "pink", "lightblue", "lightblue") # Adjust these colors to your preference

  
# Create the plot
ggplot(df4, aes(PC1, PC2)) +
  geom_point(aes(shape = country_label, color = country_label)) +
  stat_ellipse(
    aes(fill = region, group = region),
    geom = "polygon",
    alpha = 0.2,
    level = 0.8,
    segments = 40,
    color = "darkgray"
  ) +
  stat_ellipse(
    aes(group = region),
    geom = "path",
    level = 0.8,
    segments = 40,
    color = "darkgray"
  ) + # Line around the ellipse
  geom_text_repel(
    data = ellipse_centers,
    aes(x = PC1_center, y = PC2_center, label = region),
    color = "black"
  ) + # Add labels using ggrepel
  xlab("PC1 (39.5% Variance)") + 
  ylab("PC2 (20.1% Variance)") +
  labs(caption = "Principal Component Analysis with 56,384 SNPs \n of mosquitoes from 73 localities across 32 countries in Europe, Asia and the Americas.") + 
  guides(
    color = guide_legend(
      title = "Country",
      order = 2,
      ncol = 2
    ),
    shape = guide_legend(
      title = "Country",
      order = 2,
      ncol = 2
    ),
    fill = guide_legend(
      title = "Continent",
      order = 1,
      ncol = 1
    )
  ) +
  scale_fill_manual(values = continent_colors, labels = continent_labels) +
  scale_color_manual(values = colors) +
  scale_shape_manual(values = shapes) +
  guides(fill=guide_legend(ncol=2)) +
  my_theme() +
  theme(
    plot.caption = element_text(face = "italic"),
    legend.position = "right",
    legend.justification = "top",
    legend.box.just = "center",
    legend.box.background = element_blank(),
    legend.text=element_text(size=6),
    plot.margin = margin(5.5, 30, 5.5, 5.5, "points"),
    legend.margin = margin(5, 5, 5, 40) # move the legend a bit up
  )

#save the pca plot
ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global", "figures", "PCA_euro_global_r1_plink.pdf"
  ),
  width  = 10,
  height = 7,
  units  = "in"
)

13. Make a new SNP set with Minor Allele Frequency of 1% (MAF 0.01)

13.2 Minor allele frequency

First, we estimate the allele frequency with Plink.

cd /gpfs/gibbs/pi/caccone/mkc54/albo
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file3 \
--freq \
--out euro_global/output/MAF_check \
--silent

Now we apply the MAF filter of 0.01:

# We will use MAF of 1%
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file3 \
--maf 0.01 \
--make-bed \
--out euro_global/output/file4b \
--silent;
grep "variants" euro_global/output/file4b.log

103823 variants loaded from euro_global/output/file3.bim.
3456 variants removed due to allele frequency threshold(s).
100367 variants remaining after main filters.

Then we plot it with ggplot.

#   Import MAF data                                                         ####
maf_freq <-
  read.delim(
    here(
      "euro_global/output/MAF_check.afreq"
    ),
    header = TRUE
  )

Make MAF plot

#   make the plot                                                           ####
ggplot(
  maf_freq,
  aes(ALT_FREQS)
) +
  geom_histogram(
    colour = "black",
    fill = "lightgray",
    bins = 40
  ) +
  labs(
    x = "Minor Allele Frequency (MAF)",
    y = "Frequency (n)",
    caption = "<span style='color:red;'><i>Red</i></span> <span style='color:black;'><i>line at</i></span><span style='color:red;'><i> MAF 1%</i></span><span style='color:black;'><i> threshold</i></span>."
  ) +
  geom_text(
    aes(
      x = .01,
      y = 8000,
      label = paste0("3,456 SNPs")
    ),
    size = 3,
    color = "red",
    vjust = -.2
  ) +
  geom_vline(xintercept = 0.01, color = "red") +
  my_theme() +
  theme(plot.caption = element_markdown()) +
  scale_y_continuous(labels = scales::number_format(big.mark = ",")) +
  scale_x_continuous(breaks = c(0, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0))
## Warning in geom_text(aes(x = 0.01, y = 8000, label = paste0("3,456 SNPs")), : All aesthetics have length 1, but the data has 103823 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

# save the plot
ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global", "figures", "MAF_0.01_freq_plot_euro_global.pdf"
  ),
  width  = 7,
  height = 5,
  units  = "in"
)
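
The warning above comes from passing single-value aesthetics to geom_text(). As ggplot2 suggests, the label can be drawn with annotate() instead; a sketch of the replacement layer:

# replaces the geom_text() layer; same position, label, and styling
annotate(
  "text",
  x = 0.01, y = 8000,
  label = "3,456 SNPs",
  size = 3, color = "red", vjust = -0.2
)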

We removed 3,456 variants due to the MAF filter.

13.3 HWE test

Now we can run the HWE test. However, we need to apply the SNP missingness filter again within each population; if we do not, the HWE results will vary widely. With the bash script below, we create a new file for each population, run the HWE test with a p-value threshold of 1e-6, and then ask Plink to generate a list of SNPs that passed the test in each population.
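
For intuition, the test compares the observed genotype counts at each SNP with the Hardy-Weinberg expectations. Plink2's --hwe uses an exact test, but a chi-square version of the same idea, with made-up counts, looks like this:

obs   <- c(AA = 60, Aa = 30, aa = 10)               # hypothetical genotype counts
n     <- sum(obs)
p     <- (2 * obs[["AA"]] + obs[["Aa"]]) / (2 * n)  # frequency of allele A
expd  <- n * c(p^2, 2 * p * (1 - p), (1 - p)^2)     # expected counts under HWE
chisq <- sum((obs - expd)^2 / expd)
pchisq(chisq, df = 1, lower.tail = FALSE)           # compare to the 1e-6 cutoff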

mkdir -p euro_global/output/files/hardyb;
cp euro_global/output/file4b.* euro_global/output/files/hardyb/
for fam in $(awk '{print $1}' euro_global/output/files/hardyb/file4b.fam | sort | uniq); 
do 
echo $fam | \
plink2 \
--allow-extra-chr \
--silent \
--keep-allele-order \
--bfile euro_global/output/files/hardyb/file4b \
--keep-fam /dev/stdin \
--make-bed \
--out euro_global/output/files/hardyb/$fam \
--hwe 0.000001 \
--geno 0.1 \
--write-snplist; \
done

Next, we use “cat” and “awk” to concatenate the SNP lists from all populations and remove duplicates. Once we have the list of SNPs that passed the test in each population, we can use Plink to create a new bed file keeping only those SNPs. First, let's get the list of SNPs and count how many passed:

cd /gpfs/gibbs/pi/caccone/mkc54/albo
cat euro_global/output/files/hardyb/*.snplist | awk '!a[$0]++' > euro_global/output/files/passed_hwe_b.txt;
wc -l euro_global/output/files/passed_hwe_b.txt

All 100,367 variants passed the HWE test. If any had failed, we could remove them using the --extract flag, keeping only the SNPs that passed HWE.

13.5 LD pruning

Since we do not have to remove any SNPs for deviation from HWE, we can proceed to the heterozygosity estimates. The first step is to “prune” our data set: we check the pairwise linkage estimates for all SNPs, working with file4b. We use “indep-pairwise” to flag SNPs above a certain linkage disequilibrium (LD) threshold; check the Plink documentation for more details: https://www.cog-genomics.org/plink/1.9/ld. We used “--indep-pairwise 5 1 0.1”, which according to the documentation means: --indep-pairwise <window size>['kb'] <step size (variant ct)> <r^2 threshold>. So in a window of 5 variants we check whether any pair of SNPs has an r^2 estimate above 0.1, then move the window 1 SNP forward and check again, repeating until we have covered the entire genome.
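
The statistic being thresholded is simply the squared correlation between the genotype dosages of two SNPs; a toy sketch with hypothetical 0/1/2 allele counts:

g1 <- c(0, 1, 2, 1, 0, 2, 1, 0)  # hypothetical dosages at SNP 1
g2 <- c(0, 1, 2, 2, 0, 2, 1, 1)  # hypothetical dosages at SNP 2
cor(g1, g2)^2                    # pairs with r^2 above 0.1 trigger pruning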

Need to switch back to plink2

module load PLINK/2_avx2_20221024
#PLINK/1.9b_6.21-x86_64 => PLINK/2_avx2_20221024
plink2 --version
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file4b \
--extract euro_global/output/files/passed_hwe_b.txt \
--indep-pairwise 5 1 0.1 \
--out euro_global/output/indepSNPb \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNPb.log

--indep-pairwise 5 1 0.1
708 samples (85 females, 70 males, 553 ambiguous; 708 founders) loaded from euro_global/output/file4b.fam.
100367 variants loaded from euro_global/output/file4b.bim.
--extract: 100367 variants remaining.
100367 variants remaining after main filters.
--indep-pairwise (27 compute threads): 34106/100367 variants removed.

As before, the SNPs are not removed from our data set. Plink wrote “indepSNPb.prune.in” (SNPs with r2 below our 0.1 threshold) and “indepSNPb.prune.out” (SNPs above it); we use the prune.in set for the heterozygosity estimates.

13.6 Heterozygosity

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file4b \
--extract euro_global/output/indepSNPb.prune.in \
--het \
--out euro_global/output/R_checkb \
--silent;
grep 'variants' euro_global/output/R_checkb.log

100367 variants loaded from euro_global/output/file4b.bim.
--extract: 66261 variants remaining.
66261 variants remaining after main filters.

#   find individuals with high heterozygosity                              ####
# import the data from Plink
het <- read.delim(
  here(
    "euro_global/output/R_checkb.het"
  ),
  head = TRUE
)
#
# check head of the file
colnames(het)
## [1] "X.FID"  "IID"    "O.HOM." "E.HOM." "OBS_CT" "F"

Estimate heterozygosity

# create a column named HET_RATE and calculate the heterozygosity rate
het$HET_RATE <- (het$"OBS_CT" - het$"O.HOM.") / het$"OBS_CT" # O.HOM. is the observed homozygote count
#
# use subset function to get values deviating from 4sd of the mean heterozygosity rate.
het_fail <-
  subset(
    het, (het$HET_RATE < mean(
      het$HET_RATE
    ) - 4 * sd(
      het$HET_RATE
    )) |
      (het$HET_RATE > mean(
        het$HET_RATE
      ) + 4 * sd(
        het$HET_RATE
      ))
  )
#
# get the list of individuals that failed our threshold of 4sd from the mean.
het_fail$HET_DST <-
  (het_fail$HET_RATE - mean(
    het$HET_RATE
  )) / sd(
    het$HET_RATE
  )

Save the files to use with Plink

#   save the data to use with Plink2                                        ####
#
write.table(
  het_fail,
  here(
    "euro_global/output/fail-het-qc_b.txt"
  ),
  row.names = FALSE
)

Make plot

#   make a heterozygosity plot                                              ####
#
ggplot(
  het,
  aes(
    HET_RATE
  )
) +
  geom_histogram(
    colour           = "black",
    fill             = "lightgray",
    bins             = 40
  ) +
  labs(
    x                = "Heterozygosity Rate",
    y                = "Number of Individuals"
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      )
    ),
    col              = "red",
    linewidth        = 1.5
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      ) + 4 * sd(
        HET_RATE
      )
    ),
    col              = "#BFB9B9",
    linewidth        = 1
  ) +
  geom_vline(
    aes(
      xintercept     = mean(
        HET_RATE
      ) - 4 * sd(
        HET_RATE
      )
    ),
    col              = "#BFB9B9",
    linewidth        = 1
  ) + 
  my_theme() +
  scale_y_continuous(
  )

#   save the heterozygosity plot                                            ####
ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global", "figures", "Heterozygosity_euro_global_MAF_0.01.pdf"
  ),
  width  = 5,
  height = 4,
  units  = "in"
)

As in section 3.6, the red line marks the mean heterozygosity rate and the gray lines mark 4 standard deviations from it; some mosquitoes again show excess heterozygous sites. We get their IDs from “fail-het-qc_b.txt” and parse the file for Plink with the bash script below.

sed 's/"// g' euro_global/output/fail-het-qc_b.txt | awk '{print$1, $2}'> euro_global/output/het_fail_ind_b.txt;
echo 'How many mosquitoes we need to remove from our data set:';
cat euro_global/output/het_fail_ind_b.txt | tail -n +2 | wc -l;
echo 'Which mosquitoes we have to remove:';
tail -n +2 euro_global/output/het_fail_ind_b.txt
## How many mosquitoes we need to remove from our data set:
## 9
## Which mosquitoes we have to remove:
## QNC 1248
## KAT 601
## KAT 605
## KAT 606
## KAT 608
## GRA 734
## TUA 783
## ITP 832
## ROS 858

Heterozygosity is outside the SD threshold in 4 KAT individuals (601, 605, 606, 608), 1 QNC (1248), 1 GRA (734), 1 TUA (783), 1 ITP (832), and 1 ROS (858). We will remove these 9 individuals.

Next, we will remove these mosquitoes from our data set using Plink:

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file4b \
--remove euro_global/output/het_fail_ind_b.txt \
--make-bed \
--out euro_global/output/file5b \
--silent;
grep 'variants\|samples' euro_global/output/file5b.log

708 samples (85 females, 70 males, 553 ambiguous; 708 founders) loaded from euro_global/output/file4b.fam.
100367 variants loaded from euro_global/output/file4b.bim.
--remove: 699 samples remaining.
699 samples (85 females, 65 males, 549 ambiguous; 699 founders) remaining after main filters.

13.7 Relatedness

Check for cryptic relatedness. See the Plink2 documentation (https://www.cog-genomics.org/plink/2.0/distance). You can download KING directly (https://www.kingrelatedness.com/manual.shtml) or check the manuscript (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3025716/pdf/btq559.pdf). From the Plink2 documentation: "Note that KING kinship coefficients are scaled such that duplicate samples have kinship 0.5, not 1. First-degree relations (parent-child, full siblings) correspond to ~0.25, second-degree relations correspond to ~0.125, etc. It is conventional to use a cutoff of ~0.354 (the geometric mean of 0.5 and 0.25) to screen for monozygotic twins and duplicate samples, ~0.177 to add first-degree relations, etc." There are two options: run only --make-king, or use --make-king-table. We will use the threshold of 0.354 and create a table.
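The cutoffs quoted above are geometric means of adjacent relationship classes, which is easy to verify in R:

# conventional KING screening cutoffs
sqrt(0.5   * 0.25)    # ~0.354: duplicates / monozygotic twins
sqrt(0.25  * 0.125)   # ~0.177: first-degree relatives
sqrt(0.125 * 0.0625)  # ~0.088: second-degree relatives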

# Plink2 will create a file with extension .king
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file5b \
--extract euro_global/output/indepSNPb.prune.in \
--make-king-table rel-check \
--king-table-filter 0.354 \
--out euro_global/output/file5b \
--silent;
grep 'variants\|samples' euro_global/output/file5b.log

699 samples (85 females, 65 males, 549 ambiguous; 699 founders) loaded from euro_global/output/file5b.fam.
100367 variants loaded from euro_global/output/file5b.bim.
--extract: 66261 variants remaining.
66261 variants remaining after main filters.

Check the individuals that did not pass our filtering.

head -n 14 euro_global/output/file5b.kin0
## #FID1    IID1    FID2    IID2    NSNP    HETHET  IBS0    KINSHIP
## DES  1445    DES 1292    64075   0.234975    0.00380804  0.448174
## ITB  748 ITB 747 64403   0.233871    0.00909896  0.40836
## KAN  1327    KAN 1325    64296   0.209873    0.00824313  0.389784
## REC  179 REC 175 62826   0.216773    0.00733773  0.395894
## SIC  1231    SIC 1229    62126   0.234958    0.00276857  0.454991
## SIC  1235    SIC 1233    62573   0.222828    0.00663225  0.405652
## SIC  1235    SIC 1234    61890   0.205639    0.00891905  0.364829
## SIC  1236    SIC 1234    62221   0.212533    0.00861445  0.381113
## SPM  767 SPM 765 62477   0.217232    0.00496183  0.359038
## SPS  915 SPS 914 64196   0.194264    0.00954888  0.363257
## TUA  779 TUA 776 63800   0.216646    0.0111285   0.382029
## TUA  780 TUA 777 64051   0.220402    0.00864936  0.391972

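We can also read the KING table back into R to see which populations contain the related pairs; a sketch (file5b.kin0 as produced above; read.delim turns the #FID1 header into "X.FID1"):

# sketch: count flagged pairs per population
kin <- read.delim(here("euro_global/output/file5b.kin0"))
kin |>
  dplyr::count(X.FID1, name = "n_pairs") |>
  dplyr::arrange(dplyr::desc(n_pairs))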
We want to remove one individual from each related pair.

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file5b \
--extract euro_global/output/indepSNPb.prune.in \
--make-king triangle bin \
--out euro_global/output/file6b \
--silent;
grep 'variants\|samples' euro_global/output/file6b.log

699 samples (85 females, 65 males, 549 ambiguous; 699 founders) loaded from euro_global/output/file5b.fam.
100367 variants loaded from euro_global/output/file5b.bim.
--extract: 66261 variants remaining.
66261 variants remaining after main filters.
--make-king pass 1/1: Scanning for rare variants... done.
133 variants handled by initial scan (66128 remaining).
--make-king: 66261 variants processed.

Now we can use Plink2 to remove one mosquito from each pair with high kinship. Fewer samples are removed than there are pairs: where one individual is related to two others (for example SIC 1235 above), removing that single individual resolves both pairs, and Plink2 always tries to maximize the number of samples passing the filters. The log below shows 688 of the 699 samples remaining.

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file5b \
--king-cutoff euro_global/output/file6b 0.354 \
--make-bed \
--out euro_global/output/file7b \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/file7b.log

699 samples (85 females, 65 males, 549 ambiguous; 699 founders) loaded from euro_global/output/file5b.fam.
100367 variants loaded from euro_global/output/file5b.bim.
--king-cutoff: Excluded sample IDs written to euro_global/output/file7b.king.cutoff.out.id, and 688 remaining sample IDs to euro_global/output/file7b.king.cutoff.in.id.

13.8 Create chromosomal scale for MAF 0.01 files

Import the .bim file with the SNPs to create a new chromosomal scale.

#   ____________________________________________________________________________
#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "euro_global/output/file7b.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-583033342     0   315059 C       G      
## 2 1        AX-583035163     0   315386 A       G      
## 3 1        AX-583033356     0   315674 C       T      
## 4 1        AX-583033370     0   330057 G       A      
## 5 1        AX-583035194     0   330265 A       G      
## 6 1        AX-583033387     0   331288 C       T

Separate the tibble into one tibble per chromosome

#   separate the SNP data per chromosome                                    
# chr1
chr1_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^1."
    )
  ) |> # here we get only Scaffold rows starting with 1
  as_tibble() # save as tibble
#
# chr2
chr2_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^2."
    )
  ) |>
  as_tibble()
#
# chr3
chr3_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^3."
    )
  ) |>
  as_tibble()

Import the file with sizes of each scaffold.

#   import the file with the scaffold sizes                                 ####
sizes <-
  read_delim(
    here(
      "/gpfs/gibbs/project/caccone/mkc54/albo/genome/scaffold_sizes.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "cd"
  )
#
# set column names
colnames(
  sizes
) <- c(
  "Scaffold", "Size"
)
#   ____________________________________________________________________________
#   create new column with the chromosome number                            ####
sizes <- 
  sizes |>
  mutate(
    Chromosome = case_when( # we use mutate to create a new column called Chromosome
      startsWith(
        Scaffold, "1"
      ) ~ "1", # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
      startsWith(
        Scaffold, "2"
      ) ~ "2",
      startsWith(
        Scaffold, "3"
      ) ~ "3"
    )
  ) |>
  arrange(
    Scaffold
  )                   # to sort the order of the scaffolds, fixing the problem we have with scaffold 1.86
# check it
head(sizes)
## # A tibble: 6 × 3
##   Scaffold     Size Chromosome
##   <chr>       <dbl> <chr>     
## 1 1.1        351198 1         
## 2 1.10     11939576 1         
## 3 1.100     3389100 1         
## 4 1.101      470438 1         
## 5 1.102     2525157 1         
## 6 1.103      150026 1

Create new scale. Get the scaffolds for each chromosome.

#   separate the scaffold sizes tibble per chromosome                       ####
# chr1
chr1_scaffolds <- 
  sizes |>
  filter(
    str_detect(
      Scaffold, "^1" # we use library stringr to get scaffolds starting with 1 (chromosome 1)
    )
  ) |> 
  as_tibble()
#
# chr2
chr2_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^2" # we use library stringr to get scaffolds starting with 2 (chromosome 2)
    )
  ) |> 
  as_tibble()
#
# # chr3
chr3_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^3" # we use library stringr to get scaffolds starting with 3 (chromosome 3)
    )
  ) |>
  as_tibble()

Create a scale for each chromosome.

#   create a new scale for each chromosome                                  ####
# chr1
chr1_scaffolds$overall_size_before_bp <-
  0                                                                        # we create a new column with zeros
for (i in 2:nrow(
  chr1_scaffolds
)
) {                                                                        # loop to start on second line
  chr1_scaffolds$overall_size_before_bp[i] <-                              # set position on the scale
    chr1_scaffolds$overall_size_before_bp[i - 1] + chr1_scaffolds$Size[i - # add the scaffold size and the location to get position on new scale
      1]
}
#
# chr2
chr2_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr2_scaffolds
)
) {
  chr2_scaffolds$overall_size_before_bp[i] <-
    chr2_scaffolds$overall_size_before_bp[i - 1] + chr2_scaffolds$Size[i -
      1]
}
#
# chr3
chr3_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr3_scaffolds
)
) {
  chr3_scaffolds$overall_size_before_bp[i] <-
    chr3_scaffolds$overall_size_before_bp[i - 1] + chr3_scaffolds$Size[i -
      1]
}
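The loops above are cumulative sums: the offset of each scaffold is the total size of all scaffolds before it. A vectorized sketch of the same computation for chromosome 1 (dplyr::lag with default = 0 reproduces the loop exactly; repeat for chromosomes 2 and 3):

# vectorized equivalent of the chr1 loop above
chr1_scaffolds <- chr1_scaffolds |>
  dplyr::mutate(overall_size_before_bp = dplyr::lag(cumsum(Size), default = 0))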

Merge the data frames scaffolds and SNPs.

#   merge the data sets using the tidyverse function left_join              ####
# chr1
chr1_scale <-
  chr1_snps |>          # create data frame for each chromosome, get chr1_snps
  left_join(            # use the left_join function to merge it with chr1_scaffolds
    chr1_scaffolds,
    by = "Scaffold"
  ) |>                  # set column to use for merging (Scaffold in this case)
  na.omit() |>          # remove NAs, we don't have SNPs in every scaffold
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +                 # make new columns numeric
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr2
chr2_scale <-
  chr2_snps |>
  left_join(
    chr2_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr3
chr3_scale <-
  chr3_snps |>
  left_join(
    chr3_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )
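Since the three pipelines are identical, they could also be run in one pass; a compact sketch with purrr::map2 (purrr is loaded via tidyverse, and bind_rows(scales) would then reproduce the chroms object assembled below):

# sketch: the same join/offset step for all three chromosomes at once
scales <- purrr::map2(
  list(chr1_snps, chr2_snps, chr3_snps),
  list(chr1_scaffolds, chr2_scaffolds, chr3_scaffolds),
  \(snp, scaf) snp |>
    dplyr::left_join(scaf, by = "Scaffold") |>
    na.omit() |>
    dplyr::mutate(
      midPos_fullseq = as.numeric(Position) + as.numeric(overall_size_before_bp)
    )
)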

Merge all chromosome scales.

#   merge the data sets, and select only the columns we need                ####
chroms <- rbind(
  chr1_scale, chr2_scale, chr3_scale
) |>
  dplyr::select(
    Chromosome, SNP, Cm, midPos_fullseq, Allele1, Allele2
  )
# check it
head(chroms)
## # A tibble: 0 × 6
## # ℹ 6 variables: Chromosome <chr>, SNP <chr>, Cm <int>, midPos_fullseq <dbl>,
## #   Allele1 <chr>, Allele2 <chr>

(The 0-row output shown here is a rendering artifact: this chunk was re-knitted after the .bim had already been renamed to the chromosomal scale, so the "^1." scaffold patterns no longer matched. When run on the scaffold-scale .bim, chroms contains all SNPs.)

Save the new .bim file

#   ____________________________________________________________________________
#   save the new bim file with a new name, appending "C"                   ####
write.table(
  chroms,
  file      = here(
    "euro_global/output/file7C.bim"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Rename the .bim files

# back up the original .bim file by appending _backup.bim to its name
mv euro_global/output/file7b.bim euro_global/output/file7b_backup.bim;
# then rename the new .bim we created to the original name (do this only once, otherwise the files will get mixed up)
mv euro_global/output/file7C.bim euro_global/output/file7b.bim

Create a new bed file with Plink2 to check that everything works, for example that the variants are in the correct order; Plink2 will warn us if they are not.

cd /gpfs/gibbs/pi/caccone/mkc54/albo

plink2 \
--bfile euro_global/output/file7b \
--make-bed \
--out euro_global/output/test01;
# then we remove the files 
rm euro_global/output/test01.*

PLINK v2.00a3.7LM AVX2 Intel (24 Oct 2022)     www.cog-genomics.org/plink/2.0/
(C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to euro_global/output/test01.log.
Options in effect:
  --bfile euro_global/output/file7b
  --make-bed
  --out euro_global/output/test01

Start time: Wed Jan 24 12:48:03 2024
257256 MiB RAM detected; reserving 128628 MiB for main workspace.
Using up to 32 threads (change this with --threads).
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7b.fam.
100367 variants loaded from euro_global/output/file7b.bim.
Note: No phenotype data present.
Writing euro_global/output/test01.fam ... done.
Writing euro_global/output/test01.bim ... done.
Writing euro_global/output/test01.bed ... done.
End time: Wed Jan 24 12:48:03 2024

13.8.1 Estimate LD blocks using chromosomal scale

mkdir -p euro_global/output/files/ld/blocks_chr_b
module load PLINK/1.9b_6.21-x86_64

We can use Plink 1.9 to estimate LD blocks for the populations with enough individuals (listed in pops_4ld.txt). We will use the entire genome for this part instead of only the larger scaffolds. We set the maximum LD-block span to 200 kb (--blocks-max-kb 200), since we found the average half-distance of maximum r^2 to be small, around 1 to 5 kb.
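pops_4ld.txt is assumed to hold one population (family) ID in its first column. A sketch of how it could be generated from the .fam file; the minimum-n threshold here is illustrative only, adjust it to your own rule:

# sketch: list populations with enough samples for LD-block estimation
fam <- read.table(here("euro_global/output/file7b.fam"))
fam |>
  dplyr::count(V1) |>        # V1 holds the family/population ID
  dplyr::filter(n >= 9) |>   # hypothetical threshold
  dplyr::pull(V1) |>
  writeLines(here("euro_global/output/files/ld/pops_4ld.txt"))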

for fam in $(awk '{print $1}' euro_global/output/files/ld/pops_4ld.txt | sort | uniq); 
do 
echo $fam | \
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile euro_global/output/file7b \
--keep-fam /dev/stdin \
--maf 0.01 \
--blocks no-pheno-req \
--blocks-max-kb 200 \
--out euro_global/output/files/ld/blocks_chr_b/$fam \
--geno 0.1 \
--silent
done;
#
rm euro_global/output/files/ld/blocks_chr_b/*.nosex

Now we can get some data from our .log files

echo "Population,n,nVariants,geno,maf,passQC" > euro_global/output/files/ld/blocks_chr_b/table_ld_statsb.csv
for file in euro_global/output/files/ld/blocks_chr_b/*.log
do
  variants=$(grep -oE '([0-9]+) variants loaded from \.bim file' $file | grep -oE '[0-9]+')
  geno=$(grep -oE '([0-9]+) variants removed due to missing genotype data \(--geno\)' $file | grep -oE '[0-9]+')
  maf=$(grep -oE '([0-9]+) variants removed due to minor allele threshold\(s\)' $file | grep -oE '[0-9]+')
  pass=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | head -1)
  n=$(grep -oE '([0-9]+) variants and [0-9]+ people pass filters and QC\.' $file | grep -oE '[0-9]+' | tail -1)
  filename=$(basename $file .log)
  echo "$filename,$n,$variants,$geno,$maf,$pass" >> euro_global/output/files/ld/blocks_chr_b/table_ld_statsb.csv
done;
head -n 5 euro_global/output/files/ld/blocks_chr_b/table_ld_statsb.csv
## Population,n,nVariants,geno,maf,passQC
## ALU,12,100367,5574,16348,78445
## ALV,12,100367,4943,15364,80060
## BAR,12,100367,6633,18733,75001
## BEN,12,100367,7303,19822,73242

We can check it out

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "euro_global/output/files/ld/blocks_chr_b/table_ld_statsb.csv"
    ),
  header = TRUE,
  sep = ","
)

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 1: Summary of quality control for Euro_global dataset_b.")

# Save it to a Word document
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("euro_global/output/summary_blocks_chr_b.docx"))

ft

Table 1: Summary of quality control for Euro_global dataset_b.

Population   n    nVariants   geno     maf      passQC
ALU          12   100,367     5,574    16,348   78,445
ALV          12   100,367     4,943    15,364   80,060
BAR          12   100,367     6,633    18,733   75,001
BEN          12   100,367     7,303    19,822   73,242
BER          12   100,367     5,717    12,307   82,343
BRE          13   100,367     11,370   20,312   68,685
CAM          12   100,367     8,512    13,832   78,023
CES          14   100,367     17,525   21,757   61,085
CHA          12   100,367     5,415    17,268   77,684
CRO          12   100,367     5,121    15,550   79,696
DES          16   100,367     19,808   14,223   66,336
FRS          12   100,367     4,063    12,196   84,108
GES          12   100,367     4,671    22,794   72,902
GRA          11   100,367     4,895    11,957   83,515
GRV          12   100,367     6,563    20,180   73,624
HAI          12   100,367     4,842    11,241   84,284
HUN          12   100,367     4,322    10,144   85,901
ITR          12   100,367     4,011    13,036   83,320
KAG          12   100,367     5,815    16,681   77,871
KAN          11   100,367     5,188    23,833   71,346
KER          12   100,367     4,375    18,248   77,744
KRA          12   100,367     6,944    15,678   77,745
MAL          12   100,367     4,216    11,364   84,787
MAT          12   100,367     5,785    18,986   75,596
OKI          12   100,367     6,604    19,839   73,924
POP          12   100,367     4,363    10,711   85,293
QNC          11   100,367     6,335    34,788   59,244
RAR          12   100,367     4,566    23,006   72,795
REC          11   100,367     9,490    17,150   73,727
ROS          11   100,367     5,609    17,909   76,849
SEV          12   100,367     4,923    26,279   69,165
SIC          9    100,367     21,294   15,310   63,763
SLO          12   100,367     7,692    10,112   82,563
SOC          12   100,367     4,661    20,282   75,424
SSK          12   100,367     5,611    17,665   77,091
STS          12   100,367     5,030    13,379   81,958
TIK          12   100,367     4,662    23,951   71,754
TRE          12   100,367     4,066    10,074   86,227
TUA          9    100,367     13,966   18,077   68,324
TUH          12   100,367     5,217    13,641   81,509
UTS          12   100,367     4,736    16,562   79,069

Now we can count how many blocks we found in each population

cd euro_global/output/files/ld/blocks_chr_b
wc -l *.blocks | \
  sed '$d' | \
  awk '{gsub(/\.blocks$/, "", $2); print $2 "\t" $1}' \
  > populations_block_counts_b.csv;
head -n 79 populations_block_counts_b.csv
## ALU  80
## ALV  90
## BAR  107
## BEN  11
## BER  98
## BRE  286
## CAM  9
## CES  381
## CHA  8
## CRO  96
## DES  378
## FRS  59
## GES  166
## GRA  31
## GRV  168
## HAI  59
## HUN  42
## ITR  82
## KAG  123
## KAN  221
## KER  128
## KRA  83
## MAL  79
## MAT  14
## OKI  140
## POP  72
## QNC  23
## RAR  148
## REC  81
## ROS  64
## SEV  206
## SIC  0
## SLO  42
## SOC  124
## SSK  7
## STS  115
## TIK  187
## TRE  71
## TUA  2
## TUH  119
## UTS  145

Now we can add the number of blocks to the table we made

# Load data from the table_ld_stats.csv file
ld_blocks <- read.csv(
  here(
    "euro_global/output/files/ld/blocks_chr_b/table_ld_statsb.csv"
    ),
  header = TRUE,
  sep = ","
)

# Load the population counts data from the CSV file
pop_counts <-
  read.delim(
    here(
      "euro_global/output/files/ld/blocks_chr_b/populations_block_counts_b.csv"
    ),
    header = F,
    sep = "\t"
  ) |>
  rename(
    Population       = 1,
    nBlocks          = 2
  )

# Merge the population counts with the table data
ld_blocks <- merge(ld_blocks, pop_counts, by = "Population")

# Create the flextable
ft <- flextable(ld_blocks)

# Apply zebra theme
ft <- theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "Table 2: Number of linkage blocks detected with Plink for populations with at least 10 individuals.")

ft

Table 2: Number of linkage blocks detected with Plink for populations with at least 10 individuals.

Population   n    nVariants   geno     maf      passQC   nBlocks
ALU          12   100,367     5,574    16,348   78,445   80
ALV          12   100,367     4,943    15,364   80,060   90
BAR          12   100,367     6,633    18,733   75,001   107
BEN          12   100,367     7,303    19,822   73,242   11
BER          12   100,367     5,717    12,307   82,343   98
BRE          13   100,367     11,370   20,312   68,685   286
CAM          12   100,367     8,512    13,832   78,023   9
CES          14   100,367     17,525   21,757   61,085   381
CHA          12   100,367     5,415    17,268   77,684   8
CRO          12   100,367     5,121    15,550   79,696   96
DES          16   100,367     19,808   14,223   66,336   378
FRS          12   100,367     4,063    12,196   84,108   59
GES          12   100,367     4,671    22,794   72,902   166
GRA          11   100,367     4,895    11,957   83,515   31
GRV          12   100,367     6,563    20,180   73,624   168
HAI          12   100,367     4,842    11,241   84,284   59
HUN          12   100,367     4,322    10,144   85,901   42
ITR          12   100,367     4,011    13,036   83,320   82
KAG          12   100,367     5,815    16,681   77,871   123
KAN          11   100,367     5,188    23,833   71,346   221
KER          12   100,367     4,375    18,248   77,744   128
KRA          12   100,367     6,944    15,678   77,745   83
MAL          12   100,367     4,216    11,364   84,787   79
MAT          12   100,367     5,785    18,986   75,596   14
OKI          12   100,367     6,604    19,839   73,924   140
POP          12   100,367     4,363    10,711   85,293   72
QNC          11   100,367     6,335    34,788   59,244   23
RAR          12   100,367     4,566    23,006   72,795   148
REC          11   100,367     9,490    17,150   73,727   81
ROS          11   100,367     5,609    17,909   76,849   64
SEV          12   100,367     4,923    26,279   69,165   206
SIC          9    100,367     21,294   15,310   63,763   0
SLO          12   100,367     7,692    10,112   82,563   42
SOC          12   100,367     4,661    20,282   75,424   124
SSK          12   100,367     5,611    17,665   77,091   7
STS          12   100,367     5,030    13,379   81,958   115
TIK          12   100,367     4,662    23,951   71,754   187
TRE          12   100,367     4,066    10,074   86,227   71
TUA          9    100,367     13,966   18,077   68,324   2
TUH          12   100,367     5,217    13,641   81,509   119
UTS          12   100,367     4,736    16,562   79,069   145

# Save it to a Word document
# note: this reuses the file name from Table 1 above and will overwrite it;
# change the target path if you want to keep both documents
officer::read_docx() |>
  body_add_flextable(ft) |>
  print(target = here::here("euro_global/output/summary_blocks_chr_b.docx"))
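As a quick sanity check, we can ask whether the block counts simply track sample size; a sketch using the merged ld_blocks table from above:

# sketch: rank correlation between sample size and number of blocks
cor.test(ld_blocks$n, ld_blocks$nBlocks, method = "spearman", exact = FALSE)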

Get the size of each block from the .blocks.det files

get_kb_column <- function(dir_path) {
  # obtain the list of files with extension .blocks.det
  file_names <- list.files(path = dir_path, pattern = "\\.blocks\\.det$", full.names = TRUE)
  
  # create an empty list to hold the data frames
  block_list <- list()
  
  # loop through the files and read the data into the list
  for (file in file_names) {
    df <- read.table(file, header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
    # select only the KB column and add it to the block_list with the file name
    block_list[[file]] <- df %>% dplyr::select(KB) %>% add_column(file = file, .before = 1)
  }
  
  # combine the data frames in the block_list into a single data frame
  blocks <- bind_rows(block_list)
  
  # strip the directory prefix from the file name column
  blocks$file <- str_remove(blocks$file, "^.*/")
  
  return(blocks)
}

# example usage: replace dir_path with your directory path
dir_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/files/ld/blocks_chr_b")

blocks <-
  get_kb_column(dir_path) |>
  mutate(file = str_remove(file, "\\.blocks\\.det$")) |>
  as_tibble() |>
  rename(
    Population = 1
  )

Create density plot of the size of the LD blocks Plink found

# to check how many colors we need
# n_distinct(blocks$Population) #24

source(
  here("scripts", "RMarkdowns",
    "my_theme2.R"
  )
)

n <- 50
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
pie(rep(1,n), col=sample(col_vector, n))

col_vector
##  [1] "#7FC97F" "#BEAED4" "#FDC086" "#FFFF99" "#386CB0" "#F0027F" "#BF5B17"
##  [8] "#666666" "#1B9E77" "#D95F02" "#7570B3" "#E7298A" "#66A61E" "#E6AB02"
## [15] "#A6761D" "#666666" "#A6CEE3" "#1F78B4" "#B2DF8A" "#33A02C" "#FB9A99"
## [22] "#E31A1C" "#FDBF6F" "#FF7F00" "#CAB2D6" "#6A3D9A" "#FFFF99" "#B15928"
## [29] "#FBB4AE" "#B3CDE3" "#CCEBC5" "#DECBE4" "#FED9A6" "#FFFFCC" "#E5D8BD"
## [36] "#FDDAEC" "#F2F2F2" "#B3E2CD" "#FDCDAC" "#CBD5E8" "#F4CAE4" "#E6F5C9"
## [43] "#FFF2AE" "#F1E2CC" "#CCCCCC" "#E41A1C" "#377EB8" "#4DAF4A" "#984EA3"
## [50] "#FF7F00" "#FFFF33" "#A65628" "#F781BF" "#999999" "#66C2A5" "#FC8D62"
## [57] "#8DA0CB" "#E78AC3" "#A6D854" "#FFD92F" "#E5C494" "#B3B3B3" "#8DD3C7"
## [64] "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69" "#FCCDE5"
## [71] "#D9D9D9" "#BC80BD" "#CCEBC5" "#FFED6F"
# make plot using the sample y scale for all populations
ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = col_vector) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3) + my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here("scripts", "RMarkdowns",
    "output",
    "euro_global",
    "figures",
    "block_density_y_scale_fixed_chr_euro_global_b.pdf"
  ),
  width = 10,
  height = 10,
  units = "in"
)

Make plot allowing the y axis scale free

ggplot(blocks, aes(x = KB)) +
  stat_density(
    aes(y = after_stat(count), fill = factor(Population)),
    linewidth = .5,
    alpha = .4,
    position = "identity"
  ) +
  scale_fill_manual(values = col_vector) +
  scale_x_continuous(name = "Block length (kb)") +
  scale_y_continuous(name = "Count") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 14, face = "bold"),
    plot.background = element_rect(fill = "white"),
    panel.background = element_rect(fill = 'white', colour = 'black')
  ) +
  guides(fill = "none") +
  facet_wrap( ~ Population, ncol = 3, scales = "free_y") +
  my_theme()

# save the plot as a PDF using ggsave
ggsave(
  here("scripts", "RMarkdowns",
    "output",
    "euro_global",
    "figures",
    "block_density_free_scale_chr_euro_global_b.pdf"
  ),
  width = 20,
  height = 18,
  units = "in"
)

13.9 Plot SNP density

After quality control with approximately 100k SNPs

# load the function that we saved earlier
source(
  here("scripts", "RMarkdowns",
    "analyses", "import_bim.R"
  ),
  local = knitr::knit_global()
)
# import the file
snp_den_qc <- import_bim(
  here(
    "euro_global/output/file7b.bim"
  )
)

Make plot of the SNP density

#   ____________________________________________________________________________
#   plot SNP density after QC                                               ####
snp_den_qc |>
  rename(
    Chromosome = 1
  ) |>
  mutate(
    Position           = as.numeric(
      Position
    )
  ) |>
  ggplot(
    aes(
      x                = Position
    )
  ) +
  geom_histogram(
    aes(
      y                = after_stat(
        count
      )
    ),
    binwidth           = 1e6
  ) +
  facet_wrap(
    vars(
      Chromosome
    ),
    scales             = "free_x"
  ) +
  labs(
    title              = "SNP Density after QC",
    x                  = expression(
      "Position in the genome (Mb)"
    ),
    y                  = expression(
      "Number of SNPs"
    )
  ) +
  scale_x_continuous(
    labels             = function(x) {
      format(
        x / 1e6,
        big.mark       = ",", 
        scientific     = FALSE
      )
    }
  ) +
  geom_density(
  aes(
    y = 1e6 * after_stat(count)
  ),
  color = "red",
  linewidth = .75,
  alpha = .4,
  fill = "pink"
  ) +
    theme(
    panel.grid.major   = element_line(
      linetype         = "dashed",
      linewidth        = 0.2
    ),
    panel.grid.minor   = element_line(
      linetype         = "dashed",
      linewidth        = 0.2
    ),
    panel.spacing      = unit(0.5, "lines"),
    strip.text         = element_text(
      face             = "bold", hjust = .5
    ),
    strip.background.x = element_rect(
      color            = "gray"
    )
  )

#   save the density plot                                            ####
ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global","figures", "snp_density_after_qc_euro_globalb.pdf"
  ),
  width  = 10,
  height = 6,
  units  = "in"
)

SNPs per chromosome (after QC)

# we can use dplyr "count" to get the number of SNPs for each chromosome
# lets get the data we need
snps_per_chrm <- 
  snp_den_qc |>
  count(
    Scaffold) |>
  rename(
    Chromosome = 1,
    "SNPs (N) " = 2
  )

# Create the flextable
ft <- flextable::flextable(snps_per_chrm)

# Apply zebra theme
ft <- flextable::theme_zebra(ft)

# Add a caption to the table
ft <- flextable::add_header_lines(ft, "SNPs per chromosome after quality control")
ft

SNPs per chromosome after quality control

Chromosome   SNPs (N)
1            22,313
2            42,102
3            35,952

We can get the mean number of SNPs per 1 Mb window for each chromosome

# we first use dplyr cut_width to get the number of SNPs per 1Mb window
albo_den <- 
  snp_den_qc |>
  dplyr::select(
    Scaffold, Position
  ) |>
  group_by(
    Scaffold,
    windows               = cut_width(
      Position,
      width               = 1e6,
      boundary            = 0
    )
  ) |>
  summarise(
    n                     = n(),
    .groups               = "keep"
  ) |>
  group_by(
    Scaffold
  ) |>
  summarise(
    mean                  = mean(n),
    n                     = n(),
    .groups               = "keep"
  ) |>
  rename(
    Chromosome            = 1,
    "SNPs per 1Mb window" = 2,
    "Number of windows"   = 3
  )
#
# check the results
snp_table <-
  flextable(
    albo_den
  )
snp_table <- colformat_double(
  x        = snp_table,
  big.mark = ",",
  digits   = 2,
  na_str   = "N/A"
)
snp_table

Chromosome   SNPs per 1Mb window   Number of windows
1            60.47                 369
2            72.46                 581
3            73.52                 489

Merge objects

# we can merge the two data sets we created above into one table
after_qc <-
  snps_per_chrm |>
  left_join(
    albo_den,
    by = "Chromosome"
  )
snp_table2 <- flextable(
  after_qc)
snp_table2 <- colformat_double(
  x        = snp_table2,
  big.mark = ",",
  digits   = 2, 
  na_str   = "N/A"
  )
snp_table2

Chromosome   SNPs (N)   SNPs per 1Mb window   Number of windows
1            22,313     60.47                 369
2            42,102     72.46                 581
3            35,952     73.52                 489
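A genome-wide average can be computed from the merged table by weighting each chromosome's mean by its number of windows; a sketch using the after_qc columns created above (roughly 70 SNPs per 1 Mb window):

# sketch: genome-wide mean SNPs per 1 Mb window
after_qc |>
  dplyr::summarise(
    genome_mean = weighted.mean(`SNPs per 1Mb window`, `Number of windows`)
  )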

13.10 LD pruning using the chromosomal scale for MAF 0.01

# we set a window of 5 variants and slide it 1 variant at a time, removing one variant (the one with the lower MAF) from any pair with r^2 > 0.1
# the mean distance is 203 kb across the tested populations; a distance-based alternative would be --indep-pairwise 200kb 1 0.1
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file7b \
--indep-pairwise 5 1 0.1 \
--out euro_global/output/indepSNP_chrb \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNP_chrb.log

--indep-pairwise 5 1 0.1
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7b.fam.
100367 variants loaded from euro_global/output/file7b.bim.
--indep-pairwise (3 compute threads): 34050/100367 variants removed.

Let's redo the pruning on the scaffold scale.

First we need to import the same SNP list: import the .bim file with the SNPs from the scaffold-scale backup.

#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "euro_global/output/file7b_backup.bim" #output/populations/file4.bim
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1.1      AX-583033342     0   315059 C       G      
## 2 1.1      AX-583035163     0   315386 A       G      
## 3 1.1      AX-583033356     0   315674 C       T      
## 4 1.1      AX-583033370     0   330057 G       A      
## 5 1.1      AX-583035194     0   330265 A       G      
## 6 1.1      AX-583033387     0   331288 C       T
write.table(
  snps,
  file      = here(
    "euro_global/output/snps_all_scaffolds_4ld_b.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

nrow(snps)
## [1] 100367

Recreate the relatedness-filtered data set (the same --king-cutoff step that produced file7b, saved here as file11b so it keeps the scaffold-scale .bim):
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file5b \
--king-cutoff euro_global/output/file6b 0.354 \
--make-bed \
--out euro_global/output/file11b \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/file11b.log

699 samples (85 females, 65 males, 549 ambiguous; 699 founders) loaded from euro_global/output/file5b.fam.
100367 variants loaded from euro_global/output/file5b.bim.
--king-cutoff: Excluded sample IDs written to euro_global/output/file11b.king.cutoff.out.id, and 688 remaining sample IDs to euro_global/output/file11b.king.cutoff.in.id.

# we set a window of variants of 5 and move the window 1 variant each time, removing 1 of the variants with lowest MAF from a pair above the threshold of r^2 > 0.1
# the mean distance is 203kb across the tested populations. We used --indep-pairwise 5 1 0.1 before. We can use the same values from the mean half distance max r2
plink2 \
--allow-extra-chr \
--bfile euro_global/output/file11b \
--indep-pairwise 5 1 0.1 \
--out euro_global/output/indepSNP_scaffoldsb \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNP_scaffoldsb.log

--indep-pairwise 5 1 0.1
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file11b.fam.
100367 variants loaded from euro_global/output/file11b.bim.
--indep-pairwise (27 compute threads): 34032/100367 variants removed.

Now we can compare the two sets of SNPs using scaffolds or chromosomal scale

Create Venn diagram of SNPs removed due to LD

# Read in the two files as vectors
prunout_chr <-
  read_delim(
    here(
      "euro_global/output/indepSNP_chrb.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

prunout_scaffolds <-
  read_delim(
    here(
      "euro_global/output/indepSNP_scaffoldsb.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

# Convert the list to vector
prunout_scaffolds <- unlist(prunout_scaffolds)
prunout_chr <- unlist(prunout_chr)


# Calculate shared values
prunout <- intersect(prunout_chr, prunout_scaffolds)

# Create Venn diagram
venn_data1 <- list(
  "Chromosomal" = prunout_chr,
  "Scaffolds" = prunout_scaffolds
)

# create plot
venn_plot1 <- ggvenn(venn_data1, fill_color = c("steelblue", "darkorange"), show_percentage = TRUE)

# Add a title
venn_plot1 <- venn_plot1 +
  ggtitle("Comparison of genomic scales for linked SNPs MAF 0.01") +
  theme(plot.title = element_text(hjust = .5))

# Display the Venn diagram
print(venn_plot1)

# Save Venn diagram to PDF
output_path <- here("scripts", "RMarkdowns", "output", "euro_global", "figures", "SNPs_linked_comparison_euro_global_b.pdf")
ggsave(output_path, venn_plot1, height = 5, width = 5, dpi = 300)

Now compare MAF 0.01 to MAF 0.1 pruning

Create Venn diagram of SNPs removed due to LD

# Read in the two files as vectors
prunout_chr_0.01 <-
  read_delim(
    here(
      "euro_global/output/indepSNP_chrb.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

prunout_chr_0.1 <-
  read_delim(
    here(
      "euro_global/output/indepSNP_chr.prune.out"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

# Convert the list to vector
prunout_chr_0.01 <- unlist(prunout_chr_0.01)
prunout_chr_0.1 <- unlist(prunout_chr_0.1)


# Calculate shared values
prunout <- intersect(prunout_chr_0.01, prunout_chr_0.1)

# Create Venn diagram
venn_data1 <- list(
  "MAF 1%" = prunout_chr_0.01,
  "MAF 10%" = prunout_chr_0.1
)

# create plot
venn_plot1 <- ggvenn(venn_data1, fill_color = c("steelblue", "darkorange"), show_percentage = TRUE)

# Add a title
venn_plot1 <- venn_plot1 +
  ggtitle("Comparison of SNPs sets pruned for MAF 1% and MAF 10%") +
  theme(plot.title = element_text(hjust = .5))

# Display the Venn diagram
print(venn_plot1)

# Save Venn diagram to PDF
output_path <- here("scripts", "RMarkdowns", "output", "euro_global", "figures", "SNPs_comparison_euro_global_MAFs.pdf")
ggsave(output_path, venn_plot1, height = 5, width = 5, dpi = 300)

13.11 Pruning r2 for LD-pruned data sets with MAF 1%

If needed, we can create it again using the code below.

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file7b \
--indep-pairwise 5 1 0.1 \
--out euro_global/output/indepSNP_chrb \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNP_chrb.log

--indep-pairwise 5 1 0.1
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7b.fam.
100367 variants loaded from euro_global/output/file7b.bim.
--indep-pairwise (3 compute threads): 34050/100367 variants removed.

The indepSNP_chrb.prune.in file produced here contains the SNPs below our LD threshold of r^2 = 0.1, the set we want to keep.

Now we need to make the pruned data set for r^2 = 0.01. We can use file7b.

plink2 \
--allow-extra-chr \
--bfile euro_global/output/file7b \
--indep-pairwise 5 1 0.01 \
--out euro_global/output/indepSNP_chrb_r01 \
--silent;
grep 'pairwise\|variants\|samples' euro_global/output/indepSNP_chrb_r01.log

--indep-pairwise 5 1 0.01
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7b.fam.
100367 variants loaded from euro_global/output/file7b.bim.
--indep-pairwise (3 compute threads): 77725/100367 variants removed.
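Before extracting, we can sanity-check the pruned SNP counts against the log arithmetic (100367 - 34050 = 66317 kept at r^2 = 0.1, and 100367 - 77725 = 22642 kept at r^2 = 0.01); a quick sketch in R:

# sketch: count the SNPs kept in each prune.in file
length(readLines(here("euro_global/output/indepSNP_chrb.prune.in")))      # expect 66317
length(readLines(here("euro_global/output/indepSNP_chrb_r01.prune.in")))  # expect 22642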

We can take the SNP set from the LD pruning we just did and extract it from file7b.

plink2 \
--keep-allele-order \
--bfile euro_global/output/file7b \
--make-bed \
--export vcf \
--out euro_global/output/snps_sets/r2_0.1_b \
--extract euro_global/output/indepSNP_chrb.prune.in \
--silent
grep "variants\|samples" euro_global/output/snps_sets/r2_0.1_b.log

688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7b.fam.
100367 variants loaded from euro_global/output/file7b.bim.
--extract: 66317 variants remaining.
66317 variants remaining after main filters.

Repeat for the r^2 = 0.01 data set (Set 3)

plink2 \
--keep-allele-order \
--bfile euro_global/output/file7b \
--make-bed \
--export vcf \
--out euro_global/output/snps_sets/r2_0.01_b \
--extract euro_global/output/indepSNP_chrb_r01.prune.in \
--silent
grep "variants\|samples" euro_global/output/snps_sets/r2_0.01_b.log

688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from euro_global/output/file7b.fam.
100367 variants loaded from euro_global/output/file7b.bim.
--extract: 22642 variants remaining.
22642 variants remaining after main filters.

These SNP sets can now be used for the subsequent population genetic analyses.

Create a Venn diagram of the SNPs retained after LD pruning

# Read in the two files as vectors
prunout_chr_0.01 <-
  read_delim(
    here(
      "euro_global/output/indepSNP_chrb_r01.prune.in"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

prunout_chr_0.1 <-
  read_delim(
    here(
      "euro_global/output/indepSNP_chr_r01.prune.in"
    ),
    delim = " ",
    col_names = FALSE,
    show_col_types = FALSE
  )

# Convert the list to vector
prunout_chr_0.01 <- unlist(prunout_chr_0.01)
prunout_chr_0.1 <- unlist(prunout_chr_0.1)


# Calculate shared values
prunout <- intersect(prunout_chr_0.01, prunout_chr_0.1)

# Create Venn diagram
venn_data1 <- list(
  "MAF 1%" = prunout_chr_0.01,
  "MAF 10%" = prunout_chr_0.1
)

# create plot
venn_plot1 <- ggvenn(venn_data1, fill_color = c("steelblue", "darkorange"), show_percentage = TRUE)

# Add a title
venn_plot1 <- venn_plot1 +
  ggtitle("Comparison of LD 0.01 SNPs sets for MAF 1% and MAF 10%") +
  theme(plot.title = element_text(hjust = .5))

# Display the Venn diagram
print(venn_plot1)

# Save Venn diagram to PDF
output_path <- here("scripts", "RMarkdowns","output", "euro_global", "figures", "SNPs_comparison_euro_global_MAFs_r2_0.01.pdf")
ggsave(output_path, venn_plot1, height = 5, width = 5, dpi = 300)