1. Load libraries

library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.3.1     ✔ stringr 1.5.1
## ✔ readr   2.1.5     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(here)
## here() starts at /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns

Prepare the data

Mosquitoes removed due excess heterozygosity

tail -n +2 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/het_fail_ind.txt
## GRA 734
## TUA 783
## ITP 829
## ITP 832
## ROS 858

Mosquitoes removed duo to relatedness

head -n 10 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/file7.king.cutoff.out.id
## #FID IID
## SIC  1231
## SIC  1235
## SIC  1236
## DES  1445
## ITB  748
## SPM  767
## TUA  779
## TUA  780
## SPS  914

Create a list of individuals to remove

echo 'GRA 734
TUA 783
ITP 829
ITP 832
ROS 858
SIC 1231
SIC 1235
SIC 1236
DES 1445
ITB 748
SPM 767
TUA 779
TUA 780
SPS 914' > /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/remove.txt
module load PLINK/2_avx2_20221024

Create file

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe

plink2 \
--allow-extra-chr \
--remove output/snps_sets/linkage/remove.txt \
--bfile output/snps_sets/linkage/file3 \
--make-bed \
--out output/snps_sets/linkage/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/ld1.log

423 samples (44 females, 1 male, 378 ambiguous; 423 founders) loaded from 107776 variants loaded from output/snps_sets/linkage/file3.bim. –remove: 409 samples remaining. 409 samples (41 females, 1 male, 367 ambiguous; 409 founders) remaining after

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.fam | sort | uniq -c | awk '{print $2, $1}'
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BRE 13
## BUL 10
## CES 14
## CRO 12
## DES 16
## FRS 12
## GES 12
## GRA 11
## GRC 10
## IMP 4
## ITB 5
## ITP 8
## ITR 12
## KER 12
## KRA 12
## MAL 12
## POL 2
## POP 12
## RAR 12
## ROM 4
## ROS 11
## SCH 5
## SER 4
## SEV 12
## SIC 9
## SLO 12
## SOC 12
## SPB 8
## SPC 6
## SPM 5
## SPS 8
## STS 7
## TIK 12
## TIR 4
## TRE 12
## TUA 9
## TUH 12

Create list of populations

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 6 {print}' | awk '{print $1}' > /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/pops_4ld.txt;
head -n100 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/pops_4ld.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## ITP
## ITR
## KER
## KRA
## MAL
## POP
## RAR
## ROS
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPS
## STS
## TIK
## TRE
## TUA
## TUH

ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC ITP ITR KER KRA MAL POP RAR ROS SEV SIC SLO SOC SPB SPC SPS STS TIK TRE TUA TUH

2. Create chromosomal scale

Import the .bim file with the SNPs to create a new chromosomal scale.

#   ____________________________________________________________________________
#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Scaffold SNP             Cm Position Allele1 Allele2
##   <chr>    <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1        AX-583033226     0   161729 A       G      
## 2 1        AX-583033342     0   315059 C       G      
## 3 1        AX-583035163     0   315386 A       G      
## 4 1        AX-583033356     0   315674 C       T      
## 5 1        AX-583033370     0   330057 G       A      
## 6 1        AX-583035194     0   330265 A       G

Separate the tibbles into each chromosome

#   separate the SNP data per chromosome                                    
# chr1
chr1_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^1."
    )
  ) |> # here we get only Scaffold rows starting with 1
  as_tibble() # save as tibble
#
# chr2
chr2_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^2."
    )
  ) |>
  as_tibble()
#
# chr3
chr3_snps <-
  snps |>
  filter(
    str_detect(
      Scaffold, "^3."
    )
  ) |>
  as_tibble()

Import the file with sizes of each scaffold.

#   import the file with the scaffold sizes                                 ####
sizes <-
  read_delim(
    here(
      "/gpfs/gibbs/project/caccone/mkc54/albo/genome/scaffold_sizes.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "cd"
  )
#
# set column names
colnames(
  sizes
) <- c(
  "Scaffold", "Size"
)
#   ____________________________________________________________________________
#   create new column with the chromosome number                            ####
sizes <- 
  sizes |>
  mutate(
    Chromosome = case_when( # we use mutate to create a new column called Chromosome
      startsWith(
        Scaffold, "1"
      ) ~ "1", # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
      startsWith(
        Scaffold, "2"
      ) ~ "2",
      startsWith(
        Scaffold, "3"
      ) ~ "3"
    )
  ) |>
  arrange(
    Scaffold
  )                   # to sort the order of the scaffolds, fixing the problem we have with scaffold 1.86
# check it
head(sizes)
## # A tibble: 6 × 3
##   Scaffold     Size Chromosome
##   <chr>       <dbl> <chr>     
## 1 1.1        351198 1         
## 2 1.10     11939576 1         
## 3 1.100     3389100 1         
## 4 1.101      470438 1         
## 5 1.102     2525157 1         
## 6 1.103      150026 1

Create new scale. Get the scaffolds for each chromosome.

#   separate the scaffold sizes tibble per chromosome                       ####
# chr1
chr1_scaffolds <- 
  sizes |>
  filter(
    str_detect(
      Scaffold, "^1" # we use library stringr to get scaffolds starting with 1 (chromosome 1)
    )
  ) |> 
  as_tibble()
#
# chr2
chr2_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^2" # we use library stringr to get scaffolds starting with 2 (chromosome 2)
    )
  ) |> 
  as_tibble()
#
# # chr3
chr3_scaffolds <-
  sizes |>
  filter(
    str_detect(
      Scaffold, "^3" # we use library stringr to get scaffolds starting with 3 (chromosome 3)
    )
  ) |>
  as_tibble()

Create a scale for each chromosome.

#   create a new scale for each chromosome                                  ####
# chr1
chr1_scaffolds$overall_size_before_bp <-
  0                                                                        # we create a new column with zeros
for (i in 2:nrow(
  chr1_scaffolds
)
) {                                                                        # loop to start on second line
  chr1_scaffolds$overall_size_before_bp[i] <-                              # set position on the scale
    chr1_scaffolds$overall_size_before_bp[i - 1] + chr1_scaffolds$Size[i - # add the scaffold size and the location to get position on new scale
      1]
}
#
# chr2
chr2_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr2_scaffolds
)
) {
  chr2_scaffolds$overall_size_before_bp[i] <-
    chr2_scaffolds$overall_size_before_bp[i - 1] + chr2_scaffolds$Size[i -
      1]
}
#
# chr3
chr3_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
  chr3_scaffolds
)
) {
  chr3_scaffolds$overall_size_before_bp[i] <-
    chr3_scaffolds$overall_size_before_bp[i - 1] + chr3_scaffolds$Size[i -
      1]
}

Merge the data frames scaffolds and SNPs.

#   merge the data sets using the tidyverse function left_join              ####
# chr1
chr1_scale <-
  chr1_snps |>          # create data frame for each chromosome, get chr1_snps
  left_join(            # use lef_join function to merge it with chr1_scaffolds
    chr1_scaffolds,
    by = "Scaffold"
  ) |>                  # set column to use for merging (Scaffold in this case)
  na.omit() |>          # remove NAs, we don't have SNPs in every scaffold
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +                 # make new columns numeric
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr2
chr2_scale <-
  chr2_snps |>
  left_join(
    chr2_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )
#
# chr3
chr3_scale <-
  chr3_snps |>
  left_join(
    chr3_scaffolds,
    by = "Scaffold"
  ) |>
  na.omit() |>
  mutate(
    midPos_fullseq = as.numeric(
      Position
    ) +
      as.numeric(
        overall_size_before_bp
      )
  )

Merge all chromosome scales.

#   merge the data sets, and select only the columns we need                ####
chroms <- rbind(
  chr1_scale, chr2_scale, chr3_scale
) |>
  dplyr::select(
    Chromosome, SNP, Cm, midPos_fullseq, Allele1, Allele2
  )
# check it
head(chroms)
## # A tibble: 0 × 6
## # ℹ 6 variables: Chromosome <chr>, SNP <chr>, Cm <int>, midPos_fullseq <dbl>,
## #   Allele1 <chr>, Allele2 <chr>

Save the new .bim file

#   save the new bim file with a new name, I added "B"                      ####
write.table(
  chroms,
  file      = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.bim"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Create a new bed file with Plink2 to see if it works. For example, to see if the variants are in the right order. Plink2 will give us a warning.

module load PLINK/2_avx2_20221024
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink2 \
--bfile output/snps_sets/linkage/ld1 \
--make-bed \
--out output/snps_sets/linkage/test01;
# then we remove the files 
rm output/snps_sets/linkage/test01.*

Start time: Wed Nov 1 16:25:59 2023 257256 MiB RAM detected; reserving 128628 MiB for main workspace. Using up to 32 threads (change this with –threads). 409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from output/snps_sets/linkage/ld1.fam. 107776 variants loaded from output/snps_sets/linkage/ld1.bim. Note: No phenotype data present. Writing output/snps_sets/linkage/test01.fam … done. Writing output/snps_sets/linkage/test01.bim … done. Writing output/snps_sets/linkage/test01.bed … done. End time: Wed Nov 1 16:25:59 2023

If no warnings from Plink2. Now, we can go ahead with our analysis.

3. Chromosomal level LD estimates

We will use MAF 0.005 for each chromosome (default from PopLDdecay) Clean env and memory

We can estimate LD for each chromosome separated.

Import the .bim file with the SNPs to create a new chromosomal scale.

#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Chromosome", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Chromosome SNP             Cm Position Allele1 Allele2
##   <chr>      <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1          AX-583033226     0   161729 A       G      
## 2 1          AX-583033342     0   315059 C       G      
## 3 1          AX-583035163     0   315386 A       G      
## 4 1          AX-583033356     0   315674 C       T      
## 5 1          AX-583033370     0   330057 G       A      
## 6 1          AX-583035194     0   330265 A       G

Separate the tibbles into each chromosome.

# chr1
chr1_snps <-
  snps |>
  filter(Chromosome == 1) |> # here we get only Chromosome rows starting with 1
  as_tibble() # save as tibble
#
# chr2
chr2_snps <-
  snps |>
  filter(Chromosome == 2) |>
  as_tibble()
#
# chr3
chr3_snps <-
  snps |>
  filter(Chromosome == 3) |>
  as_tibble()

We have objects with the SNPs for each chromosome

head(chr1_snps)
## # A tibble: 6 × 6
##   Chromosome SNP             Cm Position Allele1 Allele2
##   <chr>      <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1          AX-583033226     0   161729 A       G      
## 2 1          AX-583033342     0   315059 C       G      
## 3 1          AX-583035163     0   315386 A       G      
## 4 1          AX-583033356     0   315674 C       T      
## 5 1          AX-583033370     0   330057 G       A      
## 6 1          AX-583035194     0   330265 A       G

Filter the data by chromosome

#Chromosome 1
write.table(
  chr1_snps$SNP,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1_ld_SNPs.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

#Chromosome 2

write.table(
  chr2_snps$SNP,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2_ld_SNPs.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

#Chromosome 3

write.table(
  chr3_snps$SNP,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3_ld_SNPs.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Now create new files for each chromosome

Create dirs

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe

# make directory
mkdir -p output/snps_sets/linkage/chr1;
mkdir -p output/snps_sets/linkage/chr2;
mkdir -p output/snps_sets/linkage/chr3;

Chromosome 1

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/ld1 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr1_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr1/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr1/ld1.log

409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 107776 variants loaded from output/snps_sets/linkage/ld1.bim. –extract: 23976 variants remaining. –keep-fam: 376 samples remaining. 376 samples (36 females, 1 male, 339 ambiguous; 376 founders) remaining after 1614 variants removed due to allele frequency threshold(s) 22362 variants remaining after main filters.

Chromosome 2

plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/ld1 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr2_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr2/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr2/ld1.log

409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 107776 variants loaded from output/snps_sets/linkage/ld1.bim. –extract: 45220 variants remaining. –keep-fam: 376 samples remaining. 376 samples (36 females, 1 male, 339 ambiguous; 376 founders) remaining after 3154 variants removed due to allele frequency threshold(s) 42066 variants remaining after main filters.

Chromosome 3

plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/ld1 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr3_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr3/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr3/ld1.log

409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 107776 variants loaded from output/snps_sets/linkage/ld1.bim. –extract: 38580 variants remaining. –keep-fam: 376 samples remaining. 376 samples (36 females, 1 male, 339 ambiguous; 376 founders) remaining after 2544 variants removed due to allele frequency threshold(s) 36036 variants remaining after main filters.

Check the vcfs

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
ls output/snps_sets/linkage/chr*/ld1.vcf

3.1. Chromosome 1

3.1.1. run PopLDdecay

Prepare the files required by PopLDdecay

module spider bcftools
#BCFtools/1.16-GCCcore-10.2.0
module load BCFtools/1.16-GCCcore-10.2.0
bcftools query -l output/snps_sets/linkage/chr1/ld1.vcf > output/snps_sets/linkage/vcf_samples.txt

bcftools query -l output/snps_sets/linkage/chr1/ld1.vcf | cut -d'_' -f1 | sort | uniq > output/snps_sets/linkage/unique_pops_from_vcf.txt
while read pop; do
    grep "^${pop}_" output/snps_sets/linkage/vcf_samples.txt > output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt

Create conda environment for PopLDdecay

module load miniconda
conda create -n PopLDdecay-env python pip

conda activate PopLDdecay-env
pip install PopLDdecay
cd PopLDdecay; chmod 755 configure; ./configure;
       make;
       mv PopLDdecay  bin/;    #     [rm *.o]
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin

#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
 Versions:
    Perl/5.28.0-GCCcore-7.3.0
    Perl/5.32.0-GCCcore-10.2.0-minimal
    Perl/5.32.0-GCCcore-10.2.0
    Perl/5.32.1-GCCcore-10.2.0
    Perl/5.36.0-GCCcore-12.2.0
    Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
while read pop; do
    PopLDdecay -InVCF output/snps_sets/linkage/chr1/ld1.vcf \
               -OutType 4 \
               -MaxDist 1000 \
               -MAF 0.01 \
               -OutFilterSNP \
               -OutStat output/snps_sets/linkage/chr1/${pop}_LDdecay.gz \
               -SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
# the Number of subPop samples[found in VCF] is 10
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4653
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        16989
# ##SampleSize*2 =        20
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3350
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        18893
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3139
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        19130
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 10
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3395
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        18735
# ##SampleSize*2 =        20
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4501
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        17502
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 13
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4727
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        16763
# ##SampleSize*2 =        26
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 10
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4048
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        17969
# ##SampleSize*2 =        20
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 14
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4307
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5385
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        15545
# ##SampleSize*2 =        28
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3070
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        19188
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 16
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3178
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        18325
# ##SampleSize*2 =        32
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2256
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        20036
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4676
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        17568
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 11
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2573
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        19372
# ##SampleSize*2 =        22
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 10
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2768
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        19294
# ##SampleSize*2 =        20
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 8
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2907
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        19209
# ##SampleSize*2 =        16
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2535
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        19754
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4107
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        18153
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2969
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        19150
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2137
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        20146
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2088
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        20198
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5220
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        17000
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 11
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3431
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        18509
# ##SampleSize*2 =        22
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5457
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        16749
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 9
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4094
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        17469
# ##SampleSize*2 =        18
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :1836
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        20261
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4848
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        17336
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 8
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3255
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        19005
# ##SampleSize*2 =        16
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 6
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5745
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        16159
# ##SampleSize*2 =        12
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 8
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5607
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        16430
# ##SampleSize*2 =        16
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 7
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4307
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        17444
# ##SampleSize*2 =        14
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4858
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        17335
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :1865
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        20429
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 9
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3611
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        18409
# ##SampleSize*2 =        18
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2673
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number :        19582
# ##SampleSize*2 =        24
# Used [perl  ../bin/Plot_XX.pl ] to Plot the LDdecay

3.1.2. Plot using PopLDdecay

Get the files we need

for file in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/*_LDdecay.stat.gz; do 
    pop_name=$(basename $file _LDdecay.stat.gz)
    echo "$file    $pop_name"
done > /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ld_decay_results_list.txt

head -100 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ld_decay_results_list.txt
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ALD_LDdecay.stat.gz    ALD
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ALU_LDdecay.stat.gz    ALU
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ALV_LDdecay.stat.gz    ALV
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ARM_LDdecay.stat.gz    ARM
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/BAR_LDdecay.stat.gz    BAR
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/BRE_LDdecay.stat.gz    BRE
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/BUL_LDdecay.stat.gz    BUL
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/CES_LDdecay.stat.gz    CES
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/CRO_LDdecay.stat.gz    CRO
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/DES_LDdecay.stat.gz    DES
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/FRS_LDdecay.stat.gz    FRS
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/GES_LDdecay.stat.gz    GES
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/GRA_LDdecay.stat.gz    GRA
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/GRC_LDdecay.stat.gz    GRC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ITP_LDdecay.stat.gz    ITP
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ITR_LDdecay.stat.gz    ITR
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/KER_LDdecay.stat.gz    KER
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/KRA_LDdecay.stat.gz    KRA
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/MAL_LDdecay.stat.gz    MAL
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/POP_LDdecay.stat.gz    POP
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/RAR_LDdecay.stat.gz    RAR
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ROS_LDdecay.stat.gz    ROS
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SEV_LDdecay.stat.gz    SEV
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SIC_LDdecay.stat.gz    SIC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SLO_LDdecay.stat.gz    SLO
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SOC_LDdecay.stat.gz    SOC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SPB_LDdecay.stat.gz    SPB
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SPC_LDdecay.stat.gz    SPC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SPS_LDdecay.stat.gz    SPS
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/STS_LDdecay.stat.gz    STS
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/TIK_LDdecay.stat.gz    TIK
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/TRE_LDdecay.stat.gz    TRE
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/TUA_LDdecay.stat.gz    TUA
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/TUH_LDdecay.stat.gz    TUH

output/snps_sets/linkage/chr1/ALD_LDdecay.stat.gz ALD output/snps_sets/linkage/chr1/ALU_LDdecay.stat.gz ALU output/snps_sets/linkage/chr1/ALV_LDdecay.stat.gz ALV output/snps_sets/linkage/chr1/ARM_LDdecay.stat.gz ARM output/snps_sets/linkage/chr1/BAR_LDdecay.stat.gz BAR output/snps_sets/linkage/chr1/BRE_LDdecay.stat.gz BRE output/snps_sets/linkage/chr1/BUL_LDdecay.stat.gz BUL output/snps_sets/linkage/chr1/CES_LDdecay.stat.gz CES output/snps_sets/linkage/chr1/CRO_LDdecay.stat.gz CRO output/snps_sets/linkage/chr1/DES_LDdecay.stat.gz DES output/snps_sets/linkage/chr1/FRS_LDdecay.stat.gz FRS output/snps_sets/linkage/chr1/GES_LDdecay.stat.gz GES output/snps_sets/linkage/chr1/GRA_LDdecay.stat.gz GRA output/snps_sets/linkage/chr1/GRC_LDdecay.stat.gz GRC output/snps_sets/linkage/chr1/ITP_LDdecay.stat.gz ITP output/snps_sets/linkage/chr1/ITR_LDdecay.stat.gz ITR output/snps_sets/linkage/chr1/KER_LDdecay.stat.gz KER output/snps_sets/linkage/chr1/KRA_LDdecay.stat.gz KRA output/snps_sets/linkage/chr1/MAL_LDdecay.stat.gz MAL output/snps_sets/linkage/chr1/POP_LDdecay.stat.gz POP output/snps_sets/linkage/chr1/RAR_LDdecay.stat.gz RAR output/snps_sets/linkage/chr1/ROS_LDdecay.stat.gz ROS output/snps_sets/linkage/chr1/SEV_LDdecay.stat.gz SEV output/snps_sets/linkage/chr1/SIC_LDdecay.stat.gz SIC output/snps_sets/linkage/chr1/SLO_LDdecay.stat.gz SLO output/snps_sets/linkage/chr1/SOC_LDdecay.stat.gz SOC output/snps_sets/linkage/chr1/SPB_LDdecay.stat.gz SPB output/snps_sets/linkage/chr1/SPC_LDdecay.stat.gz SPC output/snps_sets/linkage/chr1/SPS_LDdecay.stat.gz SPS output/snps_sets/linkage/chr1/STS_LDdecay.stat.gz STS output/snps_sets/linkage/chr1/TIK_LDdecay.stat.gz TIK output/snps_sets/linkage/chr1/TRE_LDdecay.stat.gz TRE output/snps_sets/linkage/chr1/TUA_LDdecay.stat.gz TUA output/snps_sets/linkage/chr1/TUH_LDdecay.stat.gz TUH

Plot using PopLDdecay. It will create files that we can use to plot with ggplot

module load Perl/5.36.1-GCCcore-12.2.0
module load Data::Dumper/2.173 
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl --help
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl  \
      -in output/snps_sets/linkage/chr1/ld_decay_results_list.txt \
      -output output/snps_sets/linkage/chr1/all_pops \
      -bin1 10 \
      -bin2 100 \
      -break 100 \
      -maxX 1000 \
      -measure r2 \
      -method MeanBin \
      -keepR

3.1.3. Plot using ggplot

library(ggrepel)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(RColorBrewer)
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ld1.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Extract the family names from the first column
populations <- unique(fam_data$V1)

# Determine the number of unique populations
num_populations <- length(unique(populations))

# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")

# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)

# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Initialize an empty list to store data frames
data_frames <- list()

# Read data from each population file and store it in the list
for (pop in populations) {
  file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/all_pops.", pop, sep = "")
  data <- read.table(file_path)
  data$population <- pop  # Add a column for population name
  data_frames[[pop]] <- data
}

# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Calculate half distances
half_distances <- combined_data |>
  group_by(population) |>
  mutate(max_r2 = max(V2)) |>
  filter(V2 <= max_r2 / 2) |>
  arrange(V1) |>
  slice(1) |>
  select(population, half_distance = V1)

# Calculate the maximum V2 value for each population
max_values <- combined_data |>
  group_by(population) |>
  summarize(max_V2 = max(V2))

# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
  geom_point(size = .3, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
  geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
  geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")), 
            color = "blue", vjust = -1, size = 3) +
  labs(
    title = "",
    x = "Distance(Kb)",
    y = expression(r^2)
  ) +
  scale_color_manual(values = color_mapping) +
  scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
  scale_x_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  ) +
  facet_wrap(~ population, scales = "free_y", ncol = 2) +
  coord_cartesian(xlim = c(0, 1000))

p

Note: This produces a very big image, so you need to expand the viewer as large as possible to avoid getting an error

Save

ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/decay_chr1.pdf", plot = p, width = 5, height = 8)

Save the data to use later

saveRDS(half_distances, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/chr1.rds"
))

3.2. Chromosome 2

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1327468 70.9    3366742 179.9  2157587 115.3
## Vcells 2818950 21.6   16153512 123.3 18405414 140.5

3.2.1. run PopLDdecay

while read pop; do
    PopLDdecay -InVCF output/snps_sets/linkage/chr2/ld1.vcf \
               -OutType 4 \
               -MaxDist 1000 \
               -MAF 0.01 \
               -OutFilterSNP \
               -OutStat output/snps_sets/linkage/chr2/${pop}_LDdecay.gz \
               -SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt

3.2.2. Plot using PopLDdecay

Get the files we need

for file in output/snps_sets/linkage/chr2/*_LDdecay.stat.gz; do 
    pop_name=$(basename $file _LDdecay.stat.gz)
    echo "$file    $pop_name"
done > output/snps_sets/linkage/chr2/ld_decay_results_list.txt

head -100 output/snps_sets/linkage/chr2/ld_decay_results_list.txt

output/snps_sets/linkage/chr2/ALD_LDdecay.stat.gz ALD output/snps_sets/linkage/chr2/ALU_LDdecay.stat.gz ALU output/snps_sets/linkage/chr2/ALV_LDdecay.stat.gz ALV output/snps_sets/linkage/chr2/ARM_LDdecay.stat.gz ARM output/snps_sets/linkage/chr2/BAR_LDdecay.stat.gz BAR output/snps_sets/linkage/chr2/BRE_LDdecay.stat.gz BRE output/snps_sets/linkage/chr2/BUL_LDdecay.stat.gz BUL output/snps_sets/linkage/chr2/CES_LDdecay.stat.gz CES output/snps_sets/linkage/chr2/CRO_LDdecay.stat.gz CRO output/snps_sets/linkage/chr2/DES_LDdecay.stat.gz DES output/snps_sets/linkage/chr2/FRS_LDdecay.stat.gz FRS output/snps_sets/linkage/chr2/GES_LDdecay.stat.gz GES output/snps_sets/linkage/chr2/GRA_LDdecay.stat.gz GRA output/snps_sets/linkage/chr2/GRC_LDdecay.stat.gz GRC output/snps_sets/linkage/chr2/ITP_LDdecay.stat.gz ITP output/snps_sets/linkage/chr2/ITR_LDdecay.stat.gz ITR output/snps_sets/linkage/chr2/KER_LDdecay.stat.gz KER output/snps_sets/linkage/chr2/KRA_LDdecay.stat.gz KRA output/snps_sets/linkage/chr2/MAL_LDdecay.stat.gz MAL output/snps_sets/linkage/chr2/POP_LDdecay.stat.gz POP output/snps_sets/linkage/chr2/RAR_LDdecay.stat.gz RAR output/snps_sets/linkage/chr2/ROS_LDdecay.stat.gz ROS output/snps_sets/linkage/chr2/SEV_LDdecay.stat.gz SEV output/snps_sets/linkage/chr2/SIC_LDdecay.stat.gz SIC output/snps_sets/linkage/chr2/SLO_LDdecay.stat.gz SLO output/snps_sets/linkage/chr2/SOC_LDdecay.stat.gz SOC output/snps_sets/linkage/chr2/SPB_LDdecay.stat.gz SPB output/snps_sets/linkage/chr2/SPC_LDdecay.stat.gz SPC output/snps_sets/linkage/chr2/SPS_LDdecay.stat.gz SPS output/snps_sets/linkage/chr2/STS_LDdecay.stat.gz STS output/snps_sets/linkage/chr2/TIK_LDdecay.stat.gz TIK output/snps_sets/linkage/chr2/TRE_LDdecay.stat.gz TRE output/snps_sets/linkage/chr2/TUA_LDdecay.stat.gz TUA output/snps_sets/linkage/chr2/TUH_LDdecay.stat.gz TUH

Plot using PopLDdecay. It will create files that we can use to plot with ggplot

perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl  \
      -in output/snps_sets/linkage/chr2/ld_decay_results_list.txt \
      -output output/snps_sets/linkage/chr2/all_pops \
      -bin1 10 \
      -bin2 100 \
      -break 100 \
      -maxX 1000 \
      -measure r2 \
      -method MeanBin \
      -keepR

3.2.3. Plot using ggplot

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2/ld1.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Extract the family names from the first column
populations <- unique(fam_data$V1)

# Determine the number of unique populations
num_populations <- length(unique(populations))

# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")

# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)

# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Initialize an empty list to store data frames
data_frames <- list()

# Read data from each population file and store it in the list
for (pop in populations) {
  file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2/all_pops.", pop, sep = "")
  data <- read.table(file_path)
  data$population <- pop  # Add a column for population name
  data_frames[[pop]] <- data
}

# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Calculate half distances
half_distances <- combined_data |>
  group_by(population) |>
  mutate(max_r2 = max(V2)) |>
  filter(V2 <= max_r2 / 2) |>
  arrange(V1) |>
  slice(1) |>
  select(population, half_distance = V1)

# Calculate the maximum V2 value for each population
max_values <- combined_data |>
  group_by(population) |>
  summarize(max_V2 = max(V2))

# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
  geom_point(size = .3, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
  geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
  geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")), 
            color = "blue", vjust = -1, size = 3) +
  labs(
    title = "",
    x = "Distance(Kb)",
    y = expression(r^2)
  ) +
  scale_color_manual(values = color_mapping) +
  scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
  scale_x_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  ) +
  facet_wrap(~ population, scales = "free_y", ncol = 2) +
  coord_cartesian(xlim = c(0, 1000))

p

Save

ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2/decay_chr2.pdf", plot = p, width = 5, height = 8)

Save the data to use later

saveRDS(half_distances, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2/chr2.rds"
))

3.3. Chromosome 3

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1327735 71.0    3366742 179.9  2320596 124.0
## Vcells 2820093 21.6   15571372 118.9 18405414 140.5

3.3.1. run PopLDdecay

while read pop; do
    PopLDdecay -InVCF output/snps_sets/linkage/chr3/ld1.vcf \
               -OutType 4 \
               -MaxDist 1000 \
               -MAF 0.01 \
               -OutFilterSNP \
               -OutStat output/snps_sets/linkage/chr3/${pop}_LDdecay.gz \
               -SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt

3.3.2. Plot using PopLDdecay

Get the files we need

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
for file in output/snps_sets/linkage/chr3/*_LDdecay.stat.gz; do 
    pop_name=$(basename $file _LDdecay.stat.gz)
    echo "$file    $pop_name"
done > output/snps_sets/linkage/chr3/ld_decay_results_list.txt

head -100 output/snps_sets/linkage/chr3/ld_decay_results_list.txt
## output/snps_sets/linkage/chr3/ALD_LDdecay.stat.gz    ALD
## output/snps_sets/linkage/chr3/ALU_LDdecay.stat.gz    ALU
## output/snps_sets/linkage/chr3/ALV_LDdecay.stat.gz    ALV
## output/snps_sets/linkage/chr3/ARM_LDdecay.stat.gz    ARM
## output/snps_sets/linkage/chr3/BAR_LDdecay.stat.gz    BAR
## output/snps_sets/linkage/chr3/BRE_LDdecay.stat.gz    BRE
## output/snps_sets/linkage/chr3/BUL_LDdecay.stat.gz    BUL
## output/snps_sets/linkage/chr3/CES_LDdecay.stat.gz    CES
## output/snps_sets/linkage/chr3/CRO_LDdecay.stat.gz    CRO
## output/snps_sets/linkage/chr3/DES_LDdecay.stat.gz    DES
## output/snps_sets/linkage/chr3/FRS_LDdecay.stat.gz    FRS
## output/snps_sets/linkage/chr3/GES_LDdecay.stat.gz    GES
## output/snps_sets/linkage/chr3/GRA_LDdecay.stat.gz    GRA
## output/snps_sets/linkage/chr3/GRC_LDdecay.stat.gz    GRC
## output/snps_sets/linkage/chr3/ITP_LDdecay.stat.gz    ITP
## output/snps_sets/linkage/chr3/ITR_LDdecay.stat.gz    ITR
## output/snps_sets/linkage/chr3/KER_LDdecay.stat.gz    KER
## output/snps_sets/linkage/chr3/KRA_LDdecay.stat.gz    KRA
## output/snps_sets/linkage/chr3/MAL_LDdecay.stat.gz    MAL
## output/snps_sets/linkage/chr3/POP_LDdecay.stat.gz    POP
## output/snps_sets/linkage/chr3/RAR_LDdecay.stat.gz    RAR
## output/snps_sets/linkage/chr3/ROS_LDdecay.stat.gz    ROS
## output/snps_sets/linkage/chr3/SEV_LDdecay.stat.gz    SEV
## output/snps_sets/linkage/chr3/SIC_LDdecay.stat.gz    SIC
## output/snps_sets/linkage/chr3/SLO_LDdecay.stat.gz    SLO
## output/snps_sets/linkage/chr3/SOC_LDdecay.stat.gz    SOC
## output/snps_sets/linkage/chr3/SPB_LDdecay.stat.gz    SPB
## output/snps_sets/linkage/chr3/SPC_LDdecay.stat.gz    SPC
## output/snps_sets/linkage/chr3/SPS_LDdecay.stat.gz    SPS
## output/snps_sets/linkage/chr3/STS_LDdecay.stat.gz    STS
## output/snps_sets/linkage/chr3/TIK_LDdecay.stat.gz    TIK
## output/snps_sets/linkage/chr3/TRE_LDdecay.stat.gz    TRE
## output/snps_sets/linkage/chr3/TUA_LDdecay.stat.gz    TUA
## output/snps_sets/linkage/chr3/TUH_LDdecay.stat.gz    TUH

Plot using PopLDdecay. It will create files that we can use to plot with ggplot

perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl \
      -in output/snps_sets/linkage/chr3/ld_decay_results_list.txt \
      -output output/snps_sets/linkage/chr3/all_pops \
      -bin1 10 \
      -bin2 100 \
      -break 100 \
      -maxX 1000 \
      -measure r2 \
      -method MeanBin \
      -keepR

3.3.3. Plot using ggplot

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/ld1.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Extract the family names from the first column
populations <- unique(fam_data$V1)

# Determine the number of unique populations
num_populations <- length(unique(populations))

# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")

# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)

# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Initialize an empty list to store data frames
data_frames <- list()

# Read data from each population file and store it in the list
for (pop in populations) {
  file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/all_pops.", pop, sep = "")
  data <- read.table(file_path)
  data$population <- pop  # Add a column for population name
  data_frames[[pop]] <- data
}

# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Calculate half distances
half_distances <- combined_data |>
  group_by(population) |>
  mutate(max_r2 = max(V2)) |>
  filter(V2 <= max_r2 / 2) |>
  arrange(V1) |>
  slice(1) |>
  select(population, half_distance = V1)

# Calculate the maximum V2 value for each population
max_values <- combined_data |>
  group_by(population) |>
  summarize(max_V2 = max(V2))

# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
  geom_point(size = .3, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
  geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
  geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")), 
            color = "blue", vjust = -1, size = 3) +
  labs(
    title = "",
    x = "Distance(Kb)",
    y = expression(r^2)
  ) +
  scale_color_manual(values = color_mapping) +
  scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
  scale_x_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  ) +
  facet_wrap(~ population, scales = "free_y", ncol = 2) +
  coord_cartesian(xlim = c(0, 1000))

#p

Save

ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/decay_chr3.pdf", plot = p, width = 5, height = 8)

Save the data to use later

saveRDS(half_distances, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/chr3.rds"
))

4. Plot comparing chromosomes

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1769332 94.5    3366742 179.9  2320596 124.0
## Vcells 6059018 46.3   25471977 194.4 24446946 186.6

Create data frame with the data from each chromosome

Import the data

# Chromosome 1
chr1 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/chr1.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
  chr1 = half_distance
) |>
  mutate(chr1 = chr1/1000)

# Chromosome 2
chr2 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2/chr2.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
  chr2 = half_distance
) |>
  mutate(chr2 = chr2/1000)

# Chromosome 3
chr3 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/chr3.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
  chr3 = half_distance
) |>
  mutate(chr3 = chr3/1000)

# Merge
distance <- merge(merge(chr1, chr2, by="population"), chr3, by="population")

# Convert data from wide to long format
data_long <- gather(distance, key = "chr", value = "value", -population)

# Make it capital
data_long$chr <- tools::toTitleCase(data_long$chr)

# Remove KAT because it has mosquitoes collected from different source and had higher linkage due it
data_long <- subset(data_long, population != "KAT")

Make one plot

# Define a custom color palette
custom_palette <- c(
  "Chr1" = "orange", 
  "Chr2" = "green3", 
  "Chr3" = "blue"
)

# Reordering the levels of the chr factor
data_long$chr <- factor(data_long$chr, levels = c("Chr3", "Chr2", "Chr1"))

# source the plotting function
source(here("analyses", "my_theme2.R"))

# Plotting half_distance with borders and spaced bars
ggplot(data_long, aes(x = population, y = value, fill = chr)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
  labs(title = "", y = "Half Distance (kb)", x = "Population") +
  geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 3) +
  my_theme() +
  coord_flip() +
  scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
  theme(legend.position = "top",
        plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"))

Save

ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_comparison.pdf", width = 6, height = 12)

Import sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
head(sampling_loc)
##       Pop_City    Country  Latitude Longitude Continent Abbreviation
## 1  Franceville      Gabon  -1.59207  13.53242    Africa          GAB
## 2 Antananarivo Madagascar -18.87920  47.50790    Africa          ANT
## 3  Diego ville Madagascar -12.27361  49.29372    Africa          DGV
## 4    Morondava Madagascar -20.28420  44.27940    Africa          MAD
## 5     Vohimasy Madagascar -22.81591  47.75026    Africa          VOH
## 6      Dauguet  Mauritius -20.18530  57.52154    Africa          DAU
##          Year         Region    Subregion order
## 1        2015 Central Africa       Africa    72
## 2        2022    East Africa  East Africa    76
## 3        2022    East Africa  East Africa    77
## 4        2016    East Africa  East Africa    78
## 5 2016 & 2017    East Africa  East Africa    79
## 6        2022   Indian Ocean Indian Ocean    80
head(data_long)
##   population  chr value
## 1        ALD Chr1  52.3
## 2        ALU Chr1  24.8
## 3        ALV Chr1  17.8
## 4        ARM Chr1  23.0
## 5        BAR Chr1  71.7
## 6        BRE Chr1  28.2
# Join with sampling_loc to get sampling localities
distance2 <- data_long |>
  left_join(sampling_loc, by = c("population" = "Abbreviation"))
head(distance2)
##   population  chr value  Pop_City Country Latitude Longitude Continent Year
## 1        ALD Chr1  52.3    Durres Albania 41.29704  19.50373    Europe 2018
## 2        ALU Chr1  24.8   Alushta Ukraine 44.68289  34.40368    Europe 2021
## 3        ALV Chr1  17.8     Vlore Albania 40.46600  19.48970    Europe 2020
## 4        ARM Chr1  23.0    Ijevan Armenia 40.87971  45.14764    Europe 2020
## 5        BAR Chr1  71.7 Barcelona   Spain 41.38510   2.17340    Europe 2018
## 6        BRE Chr1  28.2   Brescia   Italy 45.53373  10.20445    Europe 1995
##            Region   Subregion order
## 1 Southern Europe East Europe    25
## 2  Eastern Europe East Europe    35
## 3 Southern Europe East Europe    24
## 4  Eastern Europe East Europe    42
## 5 Southern Europe West Europe     8
## 6 Southern Europe West Europe    12

Add the name of the city/countries to the plot

# Creating the label with population, city, and country for the y-axis
distance2$pop_city_country_label <- paste(distance2$population, "\n", distance2$Pop_City, "\n(", distance2$Country, ")", sep = "")

# Sorting by Country, then City, and then by Population to ensure populations from the same country (and city) are plotted together
distance2 <- distance2 %>% arrange(Country, Pop_City, population)

# Adjusting the factor levels for plotting in the desired order
distance2$pop_city_country_label <- fct_inorder(distance2$pop_city_country_label)


# Plotting the data
ggplot(distance2, aes(x = pop_city_country_label, y = value, fill = chr)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
  labs(title = "", y = "Half Distance (kb)", x = "") +
  geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
  my_theme() +
  coord_flip() +
  labs(title = "", y = "Half Distance (kb)", x = "Population") +
  scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
  theme(legend.position = "top",
        plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"),
        axis.text.y = element_text(size = 7, angle = 0, hjust = 1))

Save

ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_comp_poplabels.pdf", width = 6, height = 12)
# Create table
dist_table <- distance2 |>
  dplyr::select(
    population, Pop_City, Country, chr, value
  ) |>
  dplyr::rename(
    City = Pop_City,
    Population = population
  )

# Spread and arrange
dist_table_condensed <- dist_table %>%
  spread(key = chr, value = value) %>%
  dplyr::select(Population, Country, City, Chr1, Chr2, Chr3) |>
  dplyr::arrange(Country, City)


# Create table
ft <- flextable::flextable(dist_table_condensed)


# Apply zebra striping with bg
ft <- flextable::theme_zebra(ft)

# Show it
ft

Population

Country

City

Chr1

Chr2

Chr3

ALD

Albania

Durres

52.3

109.4

47.2

ALV

Albania

Vlore

17.8

0.5

79.9

ARM

Armenia

Ijevan

23.0

83.5

104.5

BUL

Bulgaria

Lom

81.7

75.0

164.1

CRO

Croatia

Dubrovnik

23.8

6.0

119.0

FRS

France

Saint-Martin-d'Heres

1.9

43.2

59.2

STS

France

Strasbourg

65.2

136.3

177.8

GES

Georgia

Sakhumi, Abkhazia

36.3

75.0

139.2

GRA

Greece

Athens

21.9

61.4

2.7

GRC

Greece

Chania

79.8

161.1

87.9

BRE

Italy

Brescia

28.2

57.0

90.1

CES

Italy

Cesena

6.2

93.2

37.3

DES

Italy

Desenzano

29.5

50.9

46.1

ITP

Italy

Puglia

134.5

166.2

290.9

ITR

Italy

Rome (Trappola)

23.0

42.4

42.4

SIC

Italy

Sicilia

82.3

126.1

124.2

TRE

Italy

Trentino

36.4

36.1

35.3

MAL

Malta

Luqa

9.0

33.5

86.4

POP

Portugal

Penafiel

10.5

30.3

25.0

ROS

Romania

Satu Mare

13.6

63.1

83.6

RAR

Russia

Armavir

24.8

115.2

73.9

KRA

Russia

Krasnodar

31.2

15.4

50.3

SOC

Russia

Sochi

40.6

100.8

77.9

TIK

Russia

Tikhoretsk

14.2

75.9

163.9

SLO

Slovenia

Ajdovscina

20.3

13.9

34.8

SPB

Spain

Badajoz

4.3

56.1

55.8

BAR

Spain

Barcelona

71.7

3.5

61.0

SPC

Spain

Catarroja

45.4

86.2

162.9

SPS

Spain

San Roque

90.2

374.2

42.8

TUA

Turkey

Aliaga

97.6

182.0

162.5

TUH

Turkey

Hopa

4.7

0.3

100.6

ALU

Ukraine

Alushta

24.8

77.8

64.4

KER

Ukraine

Kerch, Crimea

47.4

110.9

82.4

SEV

Ukraine

Sevastopol, Crimea

11.3

64.5

78.1

# Create a new Word document
doc <- read_docx()

# Add the flextable
doc <- body_add_flextable(doc, value = ft)

# Save the document to a file
# Define the path for saving the Word document
file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_albopictus.docx")

print(doc, target = file_path)

Get Counts

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/ld1.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Get the number of individuals
individuals_count <- table(fam_data$V1)
individuals_count
## 
## ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC ITP ITR KER KRA MAL POP 
##  10  12  12  10  12  13  10  14  12  16  12  12  11  10   8  12  12  12  12  12 
## RAR ROS SEV SIC SLO SOC SPB SPC SPS STS TIK TRE TUA TUH 
##  12  11  12   9  12  12   8   6   8   7  12  12   9  12

Merge the data

# Convert individuals_count to a data frame
individuals_df <- as.data.frame(individuals_count)
colnames(individuals_df) <- c("Population", "Count")

# Merge with dist_table_condensed
merged_data <- merge(dist_table_condensed, individuals_df, by="Population")

Check correlation

# Chr1 vs Count
plot_Chr1 <- ggplot(merged_data, aes(x = Count, y = Chr1)) +
  geom_point() +
  geom_smooth(method = "lm", col = "orange") +
  ggtitle("Correlation between Count and Chr1") +
  labs(x = "Count", y = "LD Half Distance - Chr1")

# Chr2 vs Count
plot_Chr2 <- ggplot(merged_data, aes(x = Count, y = Chr2)) +
  geom_point() +
  geom_smooth(method = "lm", col = "green3") +
  ggtitle("Correlation between Count and Chr2") +
  labs(x = "Count", y = "LD Half Distance - Chr2")

# Chr3 vs Count
plot_Chr3 <- ggplot(merged_data, aes(x = Count, y = Chr3)) +
  geom_point() +
  geom_smooth(method = "lm", col = "blue") +
  ggtitle("Correlation between Count and Chr3") +
  labs(x = "Count", y = "LD Half Distance - Chr3")

# Display the plots
print(plot_Chr1)
## `geom_smooth()` using formula = 'y ~ x'

print(plot_Chr2)
## `geom_smooth()` using formula = 'y ~ x'

print(plot_Chr3)
## `geom_smooth()` using formula = 'y ~ x'

# For Chr1
model_Chr1 <- lm(Chr1 ~ Count, data = merged_data)
summary_Chr1 <- summary(model_Chr1)
R2_Chr1 <- summary_Chr1$r.squared
p_value_Chr1 <- coef(summary_Chr1)[2,4]

# For Chr2
model_Chr2 <- lm(Chr2 ~ Count, data = merged_data)
summary_Chr2 <- summary(model_Chr2)
R2_Chr2 <- summary_Chr2$r.squared
p_value_Chr2 <- coef(summary_Chr2)[2,4]

# For Chr3
model_Chr3 <- lm(Chr3 ~ Count, data = merged_data)
summary_Chr3 <- summary(model_Chr3)
R2_Chr3 <- summary_Chr3$r.squared
p_value_Chr3 <- coef(summary_Chr3)[2,4]

# Display R2 and p-values
cat("For Chr1: R2 =", R2_Chr1, ", p-value =", p_value_Chr1, "\n")
## For Chr1: R2 = 0.3023191 , p-value = 0.0007558585
cat("For Chr2: R2 =", R2_Chr2, ", p-value =", p_value_Chr2, "\n")
## For Chr2: R2 = 0.2743179 , p-value = 0.001478025
cat("For Chr3: R2 =", R2_Chr3, ", p-value =", p_value_Chr3, "\n")
## For Chr3: R2 = 0.2574468 , p-value = 0.002192244
# For Chr1
model_Chr1 <- lm(Chr1 ~ Count, data = merged_data)
summary_Chr1 <- summary(model_Chr1)
R2_Chr1 <- summary_Chr1$r.squared
p_value_Chr1 <- coef(summary_Chr1)[2,4]

# For Chr2
model_Chr2 <- lm(Chr2 ~ Count, data = merged_data)
summary_Chr2 <- summary(model_Chr2)
R2_Chr2 <- summary_Chr2$r.squared
p_value_Chr2 <- coef(summary_Chr2)[2,4]

# For Chr3
model_Chr3 <- lm(Chr3 ~ Count, data = merged_data)
summary_Chr3 <- summary(model_Chr3)
R2_Chr3 <- summary_Chr3$r.squared
p_value_Chr3 <- coef(summary_Chr3)[2,4]

# Display R2 and p-values
cat("For Chr1: R2 =", R2_Chr1, ", p-value =", p_value_Chr1, "\n")
## For Chr1: R2 = 0.3023191 , p-value = 0.0007558585
cat("For Chr2: R2 =", R2_Chr2, ", p-value =", p_value_Chr2, "\n")
## For Chr2: R2 = 0.2743179 , p-value = 0.001478025
cat("For Chr3: R2 =", R2_Chr3, ", p-value =", p_value_Chr3, "\n")
## For Chr3: R2 = 0.2574468 , p-value = 0.002192244

Annotate the plots

# Helper function to extract coefficients, R2, and create the label
get_annotation <- function(model) {
  coefs <- coef(model)
  eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
  r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
  return(paste(eq, r2, sep = "\n"))
}

# Annotations for each chromosome
annotate_Chr1 <- get_annotation(model_Chr1)
annotate_Chr2 <- get_annotation(model_Chr2)
annotate_Chr3 <- get_annotation(model_Chr3)

# Plot with annotations
plot_with_annotation <- function(data, yvar, label) {
  ggplot(data, aes(x = Count, y = !!sym(yvar))) +
    geom_point() +
    geom_smooth(method = "lm", col = "orange") +
    annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
    labs(x = "Count", y = yvar)
}

plot1 <- plot_with_annotation(merged_data, "Chr1", annotate_Chr1)

# Display the plots with annotations
print(plot1)
## `geom_smooth()` using formula = 'y ~ x'

# Plot with annotations
plot_with_annotation <- function(data, yvar, label) {
  ggplot(data, aes(x = Count, y = !!sym(yvar))) +
    geom_point() +
    geom_smooth(method = "lm", col = "green3") +
    annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
    labs(x = "Count", y = yvar)
}
plot2 <- plot_with_annotation(merged_data, "Chr2", annotate_Chr2)
print(plot2)
## `geom_smooth()` using formula = 'y ~ x'

plot_with_annotation <- function(data, yvar, label) {
  ggplot(data, aes(x = Count, y = !!sym(yvar))) +
    geom_point() +
    geom_smooth(method = "lm", col = "blue") +
    annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
    labs(x = "Count", y = yvar)
}

plot3 <- plot_with_annotation(merged_data, "Chr3", annotate_Chr3)
print(plot3)
## `geom_smooth()` using formula = 'y ~ x'

# Read the data
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
merged_data_long <- melt(setDT(merged_data), id.vars = c("Population","Country", "City", "Count"), variable.name = "Chromosome")

saveRDS(merged_data_long, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/merged_data_long.rds"
))

Create a facet plot

merged_data_long <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/merged_data_long.rds"))


# Function to compute the linear model details
get_lm_details <- function(data, yvar) {
  model <- lm(data[[yvar]] ~ data$Count)
  coefs <- coef(model)
  eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
  r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
  return(tibble(Chromosome = yvar, Equation = eq, R2 = r2))
}

# Compute details for each chromosome and bind rows
annotations <- bind_rows(get_lm_details(merged_data, "Chr1"), 
                         get_lm_details(merged_data, "Chr2"), 
                         get_lm_details(merged_data, "Chr3"))

# Compute maximum y for annotations for each chromosome
annotations <- annotations %>%
  left_join(merged_data_long %>% group_by(Chromosome) %>% summarise(MaxY = max(value, na.rm = TRUE)), by = "Chromosome")

# Plotting with annotations correctly positioned in the top right corner
facet_plot <- ggplot(merged_data_long, aes(x = Count, y = value)) +
  geom_point() +
  geom_smooth(method = "lm", col = "black") +
  geom_text(data = annotations, aes(label = paste(Equation, R2, sep = "\n"), y = MaxY), 
            x = max(merged_data$Count), hjust = 1, vjust = 1) +
  facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
  labs(x = "Number of Samples", y = "LD Half Distance (kb)") +
  my_theme()

print(facet_plot)
## `geom_smooth()` using formula = 'y ~ x'

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_sample_size.pdf"), width = 4, height = 6)

Plot all countries

# Calculate mean for each Country and Chromosome
mean_data <- merged_data_long %>%
  group_by(Country, Chromosome) %>%
  summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")

# Jitter plot overlaid on boxplot for each chromosome by country without legend
ggplot(merged_data_long, aes(x = Country, y = value)) +
  geom_boxplot(alpha = 0.5) +
  geom_jitter(aes(color = Country), width = 0.2) +
  geom_text(data = mean_data, aes(y = mean_value, label = round(mean_value, 2)), vjust = 0.5, hjust = -0.5, size = 3) + # Annotate mean value
  facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
  theme_minimal() +
  labs(title = "", y = "LD Half Distance (kb)") +
  my_theme() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "none")

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_mean_by_country.pdf"), width = 6, height = 6)

Legend: The Linkage Disequilibrium (LD) half distance values in kilobases (kb) for the Aedes albopictus mosquito across countries, grouped by chromosome. Each boxplot displays the interquartile range of the LD Half Distance values, with the horizontal line in the box marking the median. Colored jittered points represent individual data points, with each color corresponding to a specific country. The plot also annotates each country-chromosome combination’s mean LD half distance. Distinct facets separate the values for different chromosomes to offer clear visualization.

Use ggstatplot to test if there is any significant differences Chr1

library(ggstatsplot)
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library(multcompView)
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Country = fct_reorder(Country, Chr1, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Country,
  y = Chr1,
  title = "Chr1",
  xlab = "Country",
  ylab = "LD Half Distance (kb)",
  ggplot.component = list(theme(legend.position = "none"))
) + my_theme() + theme(legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_chr1_test.pdf"), width = 7, height = 6)

Chr2

# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Country = fct_reorder(Country, Chr2, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Country,
  y = Chr2,
  title = "Chr2",
  xlab = "Country",
  ylab = "LD Half Distance (kb)",
  ggplot.component = list(theme(legend.position = "none"))
) + my_theme() + theme(legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_chr2_test.pdf"), width = 7, height = 6)

Chr3

# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Country = fct_reorder(Country, Chr3, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Country,
  y = Chr3,
  title = "Chr3",
  xlab = "Country",
  ylab = "LD Half Distance (kb)",
  ggplot.component = list(theme(legend.position = "none"))
) + my_theme() + theme(legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_chr3_test.pdf"), width = 7, height = 6)

Global dataset.

1. Load libraries

library(tidyverse)
library(here)

2. Prepare the data

Mosquitoes removed due excess heterozygosity

tail -n +2 /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/het_fail_ind.txt
## QNC 1248
## KAT 601
## KAT 605
## KAT 606
## KAT 608
## GRA 734
## TUA 783
## ITP 832
## ROS 858

Mosquitoes removed duo to relatedness

head -n 12 /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/file7.king.cutoff.out.id
## #FID IID
## SIC  1231
## SIC  1235
## SIC  1236
## KAN  1327
## DES  1445
## REC  179
## ITB  748
## SPM  767
## TUA  779
## TUA  780
## SPS  914

Create a list of individuals we removed

echo 'SIC   1231
SIC 1235
SIC 1236
KAN 1327
DES 1445
REC 179
ITB 748
SPM 767
TUA 779
TUA 780
SPS 914' > /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/remove.txt
module load PLINK/2_avx2_20221024

We can start with file7 (copied to the new linkage directory), because in the QC Markdown we created this file using the chromosomal scale, and it already has been filtered for relatedness, heterozygosity, missingness, etc.

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/file7.fam | sort | uniq -c | awk '{print $2, $1}'
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BEN 12
## BER 12
## BRE 13
## BUL 10
## CAM 12
## CES 14
## CHA 12
## CRO 12
## DES 16
## FRS 12
## GEL 2
## GES 12
## GRA 11
## GRC 10
## GRV 12
## HAI 12
## HAN 4
## HOC 7
## HUN 12
## IMP 4
## INJ 11
## INW 4
## ITB 5
## ITP 9
## ITR 12
## JAF 2
## KAC 6
## KAG 12
## KAN 11
## KAT 6
## KER 12
## KLP 4
## KRA 12
## KUN 4
## LAM 9
## MAL 12
## MAT 12
## OKI 12
## PAL 11
## POL 2
## POP 12
## QNC 11
## RAR 12
## REC 11
## ROM 4
## ROS 11
## SER 4
## SEV 12
## SIC 9
## SLO 12
## SOC 12
## SON 3
## SPB 8
## SPC 6
## SPM 5
## SPS 8
## SSK 12
## STS 12
## SUF 6
## SUU 6
## TAI 7
## TIK 12
## TIR 4
## TRE 12
## TUA 9
## TUH 12
## UTS 12
## YUN 9

Create list of populations

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
awk '{print $1}' output/snps_sets/linkage/file7.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 6 {print}' | awk '{print $1}' > output/snps_sets/linkage/pops_4ld.txt;
head -n100 output/snps_sets/linkage/pops_4ld.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BEN
## BER
## BRE
## BUL
## CAM
## CES
## CHA
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## GRV
## HAI
## HOC
## HUN
## INJ
## ITP
## ITR
## KAC
## KAG
## KAN
## KAT
## KER
## KRA
## LAM
## MAL
## MAT
## OKI
## PAL
## POP
## QNC
## RAR
## REC
## ROS
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPS
## SSK
## STS
## SUF
## SUU
## TAI
## TIK
## TRE
## TUA
## TUH
## UTS
## YUN

3. Chromosomal level LD estimates for global dataset

We will use MAF 0.01 for each chromosome (SNP Set 3)

We can estimate LD for each chromosome separated.

Import the .bim file with the SNPs to create a new chromosomal scale.

#   import the bim file with the SNP data                                   ####
snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/file7.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Chromosome", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Chromosome SNP             Cm Position Allele1 Allele2
##   <chr>      <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1          AX-583035163     0   315386 A       G      
## 2 1          AX-583033356     0   315674 C       T      
## 3 1          AX-583033370     0   330057 G       A      
## 4 1          AX-583035194     0   330265 A       G      
## 5 1          AX-583033387     0   331288 C       T      
## 6 1          AX-583035257     0   442875 T       C

Separate the tibbles into each chromosome.

# chr1
chr1_snps <-
  snps |>
  filter(Chromosome == 1) |> # here we get only Chromosome rows starting with 1
  as_tibble() # save as tibble
#
# chr2
chr2_snps <-
  snps |>
  filter(Chromosome == 2) |>
  as_tibble()
#
# chr3
chr3_snps <-
  snps |>
  filter(Chromosome == 3) |>
  as_tibble()

We have objects with the SNPs for each chromosome

head(chr1_snps)
## # A tibble: 6 × 6
##   Chromosome SNP             Cm Position Allele1 Allele2
##   <chr>      <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1          AX-583035163     0   315386 A       G      
## 2 1          AX-583033356     0   315674 C       T      
## 3 1          AX-583033370     0   330057 G       A      
## 4 1          AX-583035194     0   330265 A       G      
## 5 1          AX-583033387     0   331288 C       T      
## 6 1          AX-583035257     0   442875 T       C

Filter the data by chromosome

#Chromosome 1
write.table(
  chr1_snps$SNP,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_ld_SNPs.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

#Chromosome 2

write.table(
  chr2_snps$SNP,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_ld_SNPs.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

#Chromosome 3

write.table(
  chr3_snps$SNP,
  file      = here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_ld_SNPs.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Now create new files for each chromosome

Create dirs

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global

# make directory
mkdir -p output/snps_sets/linkage/chr1;
mkdir -p output/snps_sets/linkage/chr2;
mkdir -p output/snps_sets/linkage/chr3;

Chromosome 1

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/file7 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr1_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr1/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr1/ld1.log

688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from 87183 variants loaded from output/snps_sets/linkage/file7.bim. –extract: 19362 variants remaining. –keep-fam: 637 samples remaining. 637 samples (80 females, 60 males, 497 ambiguous; 637 founders) remaining after 0 variants removed due to allele frequency threshold(s) 19362 variants remaining after main filters.

Chromosome 2

plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/file7 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr2_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr2/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr2/ld1.log

688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from 87183 variants loaded from output/snps_sets/linkage/file7.bim. –extract: 36611 variants remaining. –keep-fam: 637 samples remaining. 637 samples (80 females, 60 males, 497 ambiguous; 637 founders) remaining after 0 variants removed due to allele frequency threshold(s) 36611 variants remaining after main filters.

Chromosome 3

plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/file7 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr3_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr3/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr3/ld1.log

688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from 87183 variants loaded from output/snps_sets/linkage/file7.bim. –extract: 31210 variants remaining. –keep-fam: 637 samples remaining. 637 samples (80 females, 60 males, 497 ambiguous; 637 founders) remaining after 0 variants removed due to allele frequency threshold(s) 31210 variants remaining after main filters.

Check the vcfs

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
ls output/snps_sets/linkage/chr*/ld1.vcf
## output/snps_sets/linkage/chr1/ld1.vcf
## output/snps_sets/linkage/chr2/ld1.vcf
## output/snps_sets/linkage/chr3/ld1.vcf

3.1. Chromosome 1

3.1.1. run PopLDdecay

Prepare the files required by PopLDdecay

module spider bcftools
#BCFtools/1.16-GCCcore-10.2.0
module load BCFtools/1.16-GCCcore-10.2.0
bcftools query -l output/snps_sets/linkage/chr1/ld1.vcf > output/snps_sets/linkage/vcf_samples.txt

bcftools query -l output/snps_sets/linkage/chr1/ld1.vcf | cut -d'_' -f1 | sort | uniq > output/snps_sets/linkage/unique_pops_from_vcf.txt
while read pop; do
    grep "^${pop}_" output/snps_sets/linkage/vcf_samples.txt > output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin

#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
    PopLDdecay -InVCF output/snps_sets/linkage/chr1/ld1.vcf \
               -OutType 4 \
               -MaxDist 1000 \
               -MAF 0.01 \
               -OutFilterSNP \
               -OutStat output/snps_sets/linkage/chr1/${pop}_LDdecay.gz \
               -SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt

3.1.2. Plot using PopLDdecay

Get the files we need

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
for file in output/snps_sets/linkage/chr1/*_LDdecay.stat.gz; do 
    pop_name=$(basename $file _LDdecay.stat.gz)
    echo "$file    $pop_name"
done > output/snps_sets/linkage/chr1/ld_decay_results_list.txt

head -100 output/snps_sets/linkage/chr1/ld_decay_results_list.txt

Exit the conda environment and load R

module load R/4.3.0-foss-2020b
module load Perl/5.36.1-GCCcore-12.2.0
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl --help
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl  \
      -in output/snps_sets/linkage/chr1/ld_decay_results_list.txt \
      -output output/snps_sets/linkage/chr1/all_pops \
      -bin1 10 \
      -bin2 100 \
      -break 100 \
      -maxX 1000 \
      -measure r2 \
      -method MeanBin \
      -keepR

3.1.3. Plot using ggplot

# Define the path to the .fam file using here
library(ggrepel)
library(RColorBrewer)

fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1/ld1.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Extract the family names from the first column
populations <- unique(fam_data$V1)

# Determine the number of unique populations
num_populations <- length(unique(populations))

# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")

# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)

# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Initialize an empty list to store data frames
data_frames <- list()

# Read data from each population file and store it in the list
for (pop in populations) {
  file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1/all_pops.", pop, sep = "")
  data <- read.table(file_path)
  data$population <- pop  # Add a column for population name
  data_frames[[pop]] <- data
}

# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Calculate half distances
half_distances <- combined_data |>
  group_by(population) |>
  mutate(max_r2 = max(V2)) |>
  filter(V2 <= max_r2 / 2) |>
  arrange(V1) |>
  slice(1) |>
  select(population, half_distance = V1)

# Calculate the maximum V2 value for each population
max_values <- combined_data |>
  group_by(population) |>
  summarize(max_V2 = max(V2))

# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")

library(scales)

# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
  geom_point(size = .3, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
  geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
  geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")), 
            color = "blue", vjust = -1, size = 3) +
  labs(
    title = "",
    x = "Distance(Kb)",
    y = expression(r^2)
  ) +
  scale_color_manual(values = color_mapping) +
  scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
  scale_x_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  ) +
  facet_wrap(~ population, scales = "free_y", ncol = 2) +
  coord_cartesian(xlim = c(0, 1000))

#p
#If you get an "Error in grid.Call...", the figure is too big for you viewer. Just save it as a pdf instead and open the pdf to view.
ggsave(
  filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr1.pdf"), 
  width = 14,
  height = 16,
  units = "in"
)

Note: This produces a very big image, so you need to expand the viewer as large as possible to avoid getting an error

Save the data to use later

saveRDS(half_distances, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1/chr1.rds"
))

3.2. Chromosome 2

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4312232 230.3    7322801 391.1  4966314 265.3
## Vcells 12977215  99.1   47019476 358.8 45802637 349.5
module load BCFtools/1.16-GCCcore-10.2.0
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin

#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0

3.2.1. run PopLDdecay

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
    PopLDdecay -InVCF output/snps_sets/linkage/chr2/ld1.vcf \
               -OutType 4 \
               -MaxDist 1000 \
               -MAF 0.01 \
               -OutFilterSNP \
               -OutStat output/snps_sets/linkage/chr2/${pop}_LDdecay.gz \
               -SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt

3.2.2. Plot using PopLDdecay

Get the files we need

for file in output/snps_sets/linkage/chr2/*_LDdecay.stat.gz; do 
    pop_name=$(basename $file _LDdecay.stat.gz)
    echo "$file    $pop_name"
done > output/snps_sets/linkage/chr2/ld_decay_results_list.txt

head -100 output/snps_sets/linkage/chr2/ld_decay_results_list.txt

Plot using PopLDdecay. It will create files that we can use to plot with ggplot


perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl  \
      -in output/snps_sets/linkage/chr2/ld_decay_results_list.txt \
      -output output/snps_sets/linkage/chr2/all_pops \
      -bin1 10 \
      -bin2 100 \
      -break 100 \
      -maxX 1000 \
      -measure r2 \
      -method MeanBin \
      -keepR

3.2.3. Plot using ggplot

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2/ld1.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Extract the family names from the first column
populations <- unique(fam_data$V1)

# Determine the number of unique populations
num_populations <- length(unique(populations))

# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")

# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)

# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Initialize an empty list to store data frames
data_frames <- list()

# Read data from each population file and store it in the list
for (pop in populations) {
  file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2/all_pops.", pop, sep = "")
  data <- read.table(file_path)
  data$population <- pop  # Add a column for population name
  data_frames[[pop]] <- data
}

# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Calculate half distances
half_distances <- combined_data |>
  group_by(population) |>
  mutate(max_r2 = max(V2)) |>
  filter(V2 <= max_r2 / 2) |>
  arrange(V1) |>
  slice(1) |>
  select(population, half_distance = V1)

# Calculate the maximum V2 value for each population
max_values <- combined_data |>
  group_by(population) |>
  summarize(max_V2 = max(V2))

# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")

# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
  geom_point(size = .3, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
  geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
  geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")), 
            color = "blue", vjust = -1, size = 3) +
  labs(
    title = "",
    x = "Distance(Kb)",
    y = expression(r^2)
  ) +
  scale_color_manual(values = color_mapping) +
  scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
  scale_x_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  ) +
  facet_wrap(~ population, scales = "free_y", ncol = 2) +
  coord_cartesian(xlim = c(0, 1000))

#p
ggsave(
  filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr2.pdf"), 
  width = 14,
  height = 16,
  units = "in"
)

Note: This produces a very big image, so you need to expand the viewer as large as possible to avoid getting an error

Save the data to use later

saveRDS(half_distances, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2/chr2.rds"
))

3.3. Chromosome 3

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4312317 230.4    7322801 391.1  5619349 300.2
## Vcells 13039134  99.5   47019476 358.8 47019323 358.8
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin

#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0

3.3.1. run PopLDdecay

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
    PopLDdecay -InVCF output/snps_sets/linkage/chr3/ld1.vcf \
               -OutType 4 \
               -MaxDist 1000 \
               -MAF 0.01 \
               -OutFilterSNP \
               -OutStat output/snps_sets/linkage/chr3/${pop}_LDdecay.gz \
               -SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt

3.3.2. Plot using PopLDdecay

Get the files we need

for file in output/snps_sets/linkage/chr3/*_LDdecay.stat.gz; do 
    pop_name=$(basename $file _LDdecay.stat.gz)
    echo "$file    $pop_name"
done > output/snps_sets/linkage/chr3/ld_decay_results_list.txt

head -100 output/snps_sets/linkage/chr3/ld_decay_results_list.txt

Plot using PopLDdecay. It will create files that we can use to plot with ggplot

perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl \
      -in output/snps_sets/linkage/chr3/ld_decay_results_list.txt \
      -output output/snps_sets/linkage/chr3/all_pops \
      -bin1 10 \
      -bin2 100 \
      -break 100 \
      -maxX 1000 \
      -measure r2 \
      -method MeanBin \
      -keepR

3.3.3. Plot using ggplot

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3/ld1.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Extract the family names from the first column
populations <- unique(fam_data$V1)

# Determine the number of unique populations
num_populations <- length(unique(populations))

# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")

# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)

# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Initialize an empty list to store data frames
data_frames <- list()

# Read data from each population file and store it in the list
for (pop in populations) {
  file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3/all_pops.", pop, sep = "")
  data <- read.table(file_path)
  data$population <- pop  # Add a column for population name
  data_frames[[pop]] <- data
}

# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Calculate half distances
half_distances <- combined_data |>
  group_by(population) |>
  mutate(max_r2 = max(V2)) |>
  filter(V2 <= max_r2 / 2) |>
  arrange(V1) |>
  slice(1) |>
  select(population, half_distance = V1)

# Calculate the maximum V2 value for each population
max_values <- combined_data |>
  group_by(population) |>
  summarize(max_V2 = max(V2))

# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")

# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
  geom_point(size = .3, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
  geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
  geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")), 
            color = "blue", vjust = -1, size = 3) +
  labs(
    title = "",
    x = "Distance(Kb)",
    y = expression(r^2)
  ) +
  scale_color_manual(values = color_mapping) +
  scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
  scale_x_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  ) +
  facet_wrap(~ population, scales = "free_y", ncol = 2) +
  coord_cartesian(xlim = c(0, 1000))

#p
ggsave(
  filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr3.pdf"), 
  width = 14,
  height = 16,
  units = "in"
)

Save the data to use later

saveRDS(half_distances, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3/chr3.rds"
))

4. Plot comparing chromosomes

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4312385 230.4    7322801 391.1  5619349 300.2
## Vcells 13040010  99.5   47019476 358.8 47019323 358.8

Create data frame with the data from each chromosome

Import the data

# Chromosome 1
chr1 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1/chr1.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
  chr1 = half_distance
) |>
  mutate(chr1 = chr1/1000)

# Chromosome 2
chr2 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2/chr2.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
  chr2 = half_distance
) |>
  mutate(chr2 = chr2/1000)

# Chromosome 3
chr3 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3/chr3.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
  chr3 = half_distance
) |>
  mutate(chr3 = chr3/1000)

# Merge
distance <- merge(merge(chr1, chr2, by="population"), chr3, by="population")

# Convert data from wide to long format
data_long <- gather(distance, key = "chr", value = "value", -population)

# Make it capital
data_long$chr <- tools::toTitleCase(data_long$chr)

# Remove KAT because it has mosquitoes collected from different source and had higher linkage due it
data_long <- subset(data_long, population != "KAT")

Make one plot

# Define a custom color palette
custom_palette <- c(
  "Chr1" = "orange", 
  "Chr2" = "green3", 
  "Chr3" = "blue"
)

# Reordering the levels of the chr factor
data_long$chr <- factor(data_long$chr, levels = c("Chr3", "Chr2", "Chr1"))

# source the plotting function
source(here("analyses", "my_theme2.R"))

# Plotting half_distance with borders and spaced bars
ggplot(data_long, aes(x = population, y = value, fill = chr)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
  labs(title = "", y = "Half Distance (kb)", x = "Population") +
  geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
  my_theme() +
  coord_flip() +
  scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
  theme(legend.position = "top",
        plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"))

Save

ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/ld/decay_comparison.pdf", width = 8, height = 14)

Import sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
head(sampling_loc)
##       Pop_City    Country  Latitude Longitude Continent Abbreviation
## 1  Franceville      Gabon  -1.59207  13.53242    Africa          GAB
## 2 Antananarivo Madagascar -18.87920  47.50790    Africa          ANT
## 3  Diego ville Madagascar -12.27361  49.29372    Africa          DGV
## 4    Morondava Madagascar -20.28420  44.27940    Africa          MAD
## 5     Vohimasy Madagascar -22.81591  47.75026    Africa          VOH
## 6      Dauguet  Mauritius -20.18530  57.52154    Africa          DAU
##          Year         Region    Subregion order
## 1        2015 Central Africa       Africa    72
## 2        2022    East Africa  East Africa    76
## 3        2022    East Africa  East Africa    77
## 4        2016    East Africa  East Africa    78
## 5 2016 & 2017    East Africa  East Africa    79
## 6        2022   Indian Ocean Indian Ocean    80
head(data_long)
##   population  chr value
## 1        ALD Chr1  40.7
## 2        ALU Chr1   9.9
## 3        ALV Chr1  36.7
## 4        ARM Chr1  14.2
## 5        BAR Chr1  20.3
## 6        BEN Chr1   6.3
# Join with sampling_loc to get sampling localities
distance2 <- data_long |>
  left_join(sampling_loc, by = c("population" = "Abbreviation"))
head(distance2)
##   population  chr value  Pop_City Country Latitude Longitude Continent    Year
## 1        ALD Chr1  40.7    Durres Albania 41.29704  19.50373    Europe    2018
## 2        ALU Chr1   9.9   Alushta Ukraine 44.68289  34.40368    Europe    2021
## 3        ALV Chr1  36.7     Vlore Albania 40.46600  19.48970    Europe    2020
## 4        ARM Chr1  14.2    Ijevan Armenia 40.87971  45.14764    Europe    2020
## 5        BAR Chr1  20.3 Barcelona   Spain 41.38510   2.17340    Europe    2018
## 6        BEN Chr1   6.3 Bengaluru   India 12.97160  77.59460      Asia Unknown
##            Region   Subregion order
## 1 Southern Europe East Europe    25
## 2  Eastern Europe East Europe    35
## 3 Southern Europe East Europe    24
## 4  Eastern Europe East Europe    42
## 5 Southern Europe West Europe     8
## 6      South Asia                52

Add the name of the city/countries to the plot

# Creating the label with population, city, and country for the y-axis
distance2$pop_city_country_label <- paste(distance2$population, "\n", distance2$Pop_City, "\n(", distance2$Country, ")", sep = "")

# Sorting by Country, then City, and then by Population to ensure populations from the same country (and city) are plotted together
distance2 <- distance2 %>% arrange(Country, Pop_City, population)

# Adjusting the factor levels for plotting in the desired order
distance2$pop_city_country_label <- fct_inorder(distance2$pop_city_country_label)


# Plotting the data
ggplot(distance2, aes(x = pop_city_country_label, y = value, fill = chr)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
  labs(title = "", y = "Half Distance (kb)", x = "") +
  geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
  my_theme() +
  coord_flip() +
  labs(title = "", y = "Half Distance (kb)", x = "Population") +
  scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
  theme(legend.position = "top",
        plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"),
        axis.text.y = element_text(size = 5, angle = 0, hjust = 1))

Save

ggsave("scripts/RMarkdowns/output/euro_global/ld/decay_comp_poplabels.pdf", width = 8, height = 14)
# Create table
dist_table <- distance2 |>
  dplyr::select(
    population, Pop_City, Country, chr, value
  ) |>
  dplyr::rename(
    City = Pop_City,
    Population = population
  )

# Spread and arrange
dist_table_condensed <- dist_table %>%
  spread(key = chr, value = value) %>%
  dplyr::select(Population, Country, City, Chr1, Chr2, Chr3) |>
  dplyr::arrange(Country, City)


# Create table
ft <- flextable::flextable(dist_table_condensed)


# Apply zebra striping with bg
ft <- flextable::theme_zebra(ft)

# Show it
ft

Population

Country

City

Chr1

Chr2

Chr3

ALD

Albania

Durres

40.7

72.3

47.2

ALV

Albania

Vlore

36.7

2.1

81.2

ARM

Armenia

Ijevan

14.2

53.4

37.8

GRV

Brazil

Gravatai

26.8

183.0

64.1

REC

Brazil

Recife, PE

5.4

141.4

89.2

BUL

Bulgaria

Lom

57.9

95.5

57.0

CAM

Cambodia

Phnom Penh

16.7

70.2

0.9

HAI

China

Hainan

1.5

71.5

4.6

HUN

China

Hunan

12.1

57.9

45.7

YUN

China

Yunnan

6.7

153.8

1.4

CRO

Croatia

Dubrovnik

23.1

1.6

34.2

FRS

France

Saint-Martin-d'Heres

2.1

25.5

56.9

STS

France

Strasbourg

33.3

39.5

88.6

GES

Georgia

Sakhumi, Abkhazia

5.6

94.3

120.4

GRA

Greece

Athens

32.2

40.3

0.5

GRC

Greece

Chania

62.0

141.4

87.9

BEN

India

Bengaluru

6.3

116.5

53.2

INJ

Indonesia

Jakarta

8.0

90.8

58.8

SUF

Indonesia

Sulawesi (Forest)

5.9

115.2

170.9

SUU

Indonesia

Sulawesi (Urban)

35.3

200.2

196.1

BRE

Italy

Brescia

31.6

58.1

68.4

CES

Italy

Cesena

6.2

93.2

34.4

DES

Italy

Desenzano

27.3

52.9

36.8

ITP

Italy

Puglia

27.4

2.9

120.1

ITR

Italy

Rome (Trappola)

40.1

51.1

36.8

SIC

Italy

Sicilia

32.4

52.9

124.8

TRE

Italy

Trentino

26.5

36.7

35.3

KAG

Japan

Kagoshima

26.7

17.6

20.6

KAN

Japan

Kanazawa

23.0

36.3

30.6

OKI

Japan

Okinawa

25.3

21.4

77.7

UTS

Japan

Utsunomiya

35.2

14.6

32.2

MAT

Malaysia

Tambun

25.5

182.4

45.3

MAL

Malta

Luqa

36.7

33.5

32.2

POP

Portugal

Penafiel

20.8

30.3

35.9

ROS

Romania

Satu Mare

34.1

36.9

83.6

RAR

Russia

Armavir

21.9

70.8

31.6

KRA

Russia

Krasnodar

31.2

15.4

50.3

SOC

Russia

Sochi

45.3

103.8

60.8

TIK

Russia

Tikhoretsk

0.3

75.9

116.2

SLO

Slovenia

Ajdovscina

14.7

2.5

31.8

SPB

Spain

Badajoz

4.3

56.1

58.8

BAR

Spain

Barcelona

20.3

1.7

61.0

SPC

Spain

Catarroja

46.3

99.8

116.1

SPS

Spain

San Roque

48.9

325.9

289.3

TAI

Taiwan

Tainan

9.9

135.8

149.1

CHA

Thailand

Chanthaburi

8.4

84.4

2.0

KAC

Thailand

Kanchanaburi

30.1

49.9

45.4

LAM

Thailand

Lampang

4.5

225.8

159.3

SSK

Thailand

Sisaket

11.6

84.6

40.7

TUA

Turkey

Aliaga

75.4

154.6

69.3

TUH

Turkey

Hopa

5.8

0.3

65.2

BER

USA

Berlin, NJ

0.3

33.5

33.6

PAL

USA

Palm Beach

13.2

66.9

0.3

ALU

Ukraine

Alushta

9.9

5.0

54.2

KER

Ukraine

Kerch, Crimea

5.6

66.2

61.2

SEV

Ukraine

Sevastopol, Crimea

5.1

70.8

66.0

HOC

Vietnam

Ho Chi Minh City

54.6

127.4

85.0

QNC

Vietnam

Quy Nhon City

3.4

82.5

6.6

# Create a new Word document
doc <- read_docx()

# Add the flextable
doc <- body_add_flextable(doc, value = ft)

# Save the document to a file
# Define the path for saving the Word document
file_path <- here("euro_global/output/snps_sets/linkage/decay_albopictus.docx")

print(doc, target = file_path)

Get Counts

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3/ld1.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Get the number of individuals
individuals_count <- table(fam_data$V1)
individuals_count
## 
## ALD ALU ALV ARM BAR BEN BER BRE BUL CAM CES CHA CRO DES FRS GES GRA GRC GRV HAI 
##  10  12  12  10  12  12  12  13  10  12  14  12  12  16  12  12  11  10  12  12 
## HOC HUN INJ ITP ITR KAC KAG KAN KAT KER KRA LAM MAL MAT OKI PAL POP QNC RAR REC 
##   7  12  11   9  12   6  12  11   6  12  12   9  12  12  12  11  12  11  12  11 
## ROS SEV SIC SLO SOC SPB SPC SPS SSK STS SUF SUU TAI TIK TRE TUA TUH UTS YUN 
##  11  12   9  12  12   8   6   8  12  12   6   6   7  12  12   9  12  12   9

Merge the data

# Convert individuals_count to a data frame
individuals_df <- as.data.frame(individuals_count)
colnames(individuals_df) <- c("Population", "Count")

# Merge with dist_table_condensed
merged_data <- merge(dist_table_condensed, individuals_df, by="Population")

Check correlation

# Chr1 vs Count
plot_Chr1 <- ggplot(merged_data, aes(x = Count, y = Chr1)) +
  geom_point() +
  geom_smooth(method = "lm", col = "orange") +
  ggtitle("Correlation between Count and Chr1") +
  labs(x = "Count", y = "LD Half Distance - Chr1")

# Chr2 vs Count
plot_Chr2 <- ggplot(merged_data, aes(x = Count, y = Chr2)) +
  geom_point() +
  geom_smooth(method = "lm", col = "green3") +
  ggtitle("Correlation between Count and Chr2") +
  labs(x = "Count", y = "LD Half Distance - Chr2")

# Chr3 vs Count
plot_Chr3 <- ggplot(merged_data, aes(x = Count, y = Chr3)) +
  geom_point() +
  geom_smooth(method = "lm", col = "blue") +
  ggtitle("Correlation between Count and Chr3") +
  labs(x = "Count", y = "LD Half Distance - Chr3")

# Display the plots
print(plot_Chr1)
print(plot_Chr2)
print(plot_Chr3)

Calculate correlations

# For Chr1
model_Chr1 <- lm(Chr1 ~ Count, data = merged_data)
summary_Chr1 <- summary(model_Chr1)
R2_Chr1 <- summary_Chr1$r.squared
p_value_Chr1 <- coef(summary_Chr1)[2,4]

# For Chr2
model_Chr2 <- lm(Chr2 ~ Count, data = merged_data)
summary_Chr2 <- summary(model_Chr2)
R2_Chr2 <- summary_Chr2$r.squared
p_value_Chr2 <- coef(summary_Chr2)[2,4]

# For Chr3
model_Chr3 <- lm(Chr3 ~ Count, data = merged_data)
summary_Chr3 <- summary(model_Chr3)
R2_Chr3 <- summary_Chr3$r.squared
p_value_Chr3 <- coef(summary_Chr3)[2,4]

# Display R2 and p-values
cat("For Chr1: R2 =", R2_Chr1, ", p-value =", p_value_Chr1, "\n")
## For Chr1: R2 = 0.06363568 , p-value = 0.05608641

Correlation for chromosome 2

cat("For Chr2: R2 =", R2_Chr2, ", p-value =", p_value_Chr2, "\n")
## For Chr2: R2 = 0.1866813 , p-value = 0.0007071597

Correlation for chromosome 3

cat("For Chr3: R2 =", R2_Chr3, ", p-value =", p_value_Chr3, "\n")
## For Chr3: R2 = 0.300802 , p-value = 8.30986e-06

Annotate the plots

# Helper function to extract coefficients, R2, and create the label
get_annotation <- function(model) {
  coefs <- coef(model)
  eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
  r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
  return(paste(eq, r2, sep = "\n"))
}

# Annotations for each chromosome
annotate_Chr1 <- get_annotation(model_Chr1)
annotate_Chr2 <- get_annotation(model_Chr2)
annotate_Chr3 <- get_annotation(model_Chr3)

# Plot with annotations
plot_with_annotation <- function(data, yvar, label) {
  ggplot(data, aes(x = Count, y = !!sym(yvar))) +
    geom_point() +
    geom_smooth(method = "lm", col = "orange") +
    annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
    labs(x = "Count", y = yvar)
}

plot1 <- plot_with_annotation(merged_data, "Chr1", annotate_Chr1)

# Display the plots with annotations
print(plot1)
## `geom_smooth()` using formula = 'y ~ x'

# Plot with annotations
plot_with_annotation <- function(data, yvar, label) {
  ggplot(data, aes(x = Count, y = !!sym(yvar))) +
    geom_point() +
    geom_smooth(method = "lm", col = "green3") +
    annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
    labs(x = "Count", y = yvar)
}
plot2 <- plot_with_annotation(merged_data, "Chr2", annotate_Chr2)
print(plot2)
## `geom_smooth()` using formula = 'y ~ x'

plot_with_annotation <- function(data, yvar, label) {
  ggplot(data, aes(x = Count, y = !!sym(yvar))) +
    geom_point() +
    geom_smooth(method = "lm", col = "blue") +
    annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
    labs(x = "Count", y = yvar)
}

plot3 <- plot_with_annotation(merged_data, "Chr3", annotate_Chr3)
print(plot3)
## `geom_smooth()` using formula = 'y ~ x'

# Read the data
library(data.table)
merged_data_long <- melt(setDT(merged_data), id.vars = c("Population","Country", "City", "Count"), variable.name = "Chromosome")

saveRDS(merged_data_long, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/merged_data_long.rds"
))

Create a facet plot

merged_data_long <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/merged_data_long.rds"))


# Function to compute the linear model details
get_lm_details <- function(data, yvar) {
  model <- lm(data[[yvar]] ~ data$Count)
  coefs <- coef(model)
  eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
  r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
  return(tibble(Chromosome = yvar, Equation = eq, R2 = r2))
}

# Compute details for each chromosome and bind rows
annotations <- bind_rows(get_lm_details(merged_data, "Chr1"), 
                         get_lm_details(merged_data, "Chr2"), 
                         get_lm_details(merged_data, "Chr3"))

# Compute maximum y for annotations for each chromosome
annotations <- annotations %>%
  left_join(merged_data_long %>% group_by(Chromosome) %>% summarise(MaxY = max(value, na.rm = TRUE)), by = "Chromosome")

# Plotting with annotations correctly positioned in the top right corner
facet_plot <- ggplot(merged_data_long, aes(x = Count, y = value)) +
  geom_point() +
  geom_smooth(method = "lm", col = "black") +
  geom_text(data = annotations, aes(label = paste(Equation, R2, sep = "\n"), y = MaxY), 
            x = max(merged_data$Count), hjust = 1, vjust = 1) +
  facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
  labs(x = "Number of Samples", y = "LD Half Distance (kb)") +
  my_theme()

print(facet_plot)
## `geom_smooth()` using formula = 'y ~ x'

# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_sample_size.pdf"), width = 4, height = 6)

Plot all countries

# Calculate mean for each Country and Chromosome
mean_data <- merged_data_long %>%
  group_by(Country, Chromosome) %>%
  summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")

# Jitter plot overlaid on boxplot for each chromosome by country without legend
ggplot(merged_data_long, aes(x = Country, y = value)) +
  geom_boxplot(alpha = 0.5) +
  geom_jitter(aes(color = Country), width = 0.2) +
  geom_text(data = mean_data, aes(y = mean_value, label = round(mean_value, 2)), vjust = 0.5, hjust = -0.5, size = 2) + # Annotate mean value
  facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
  theme_minimal() +
  labs(title = "", y = "LD Half Distance (kb)") +
  my_theme() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "none") +
  theme(axis.text.y = element_text(size = 5, angle = 0, hjust = 1))

# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_mean_by_country.pdf"), width = 6, height = 10)
x <- unique(merged_data$Country)
x
##  [1] "Albania"   "Ukraine"   "Armenia"   "Spain"     "India"     "USA"      
##  [7] "Italy"     "Bulgaria"  "Cambodia"  "Thailand"  "Croatia"   "France"   
## [13] "Georgia"   "Greece"    "Brazil"    "China"     "Vietnam"   "Indonesia"
## [19] "Japan"     "Russia"    "Malta"     "Malaysia"  "Portugal"  "Romania"  
## [25] "Slovenia"  "Taiwan"    "Turkey"
library(Polychrome)
P27 = createPalette(27,  c("#ff0000", "#00ff00", "#0000ff"))
swatch(P27)

color_palette_27 <- lapply(split(P27, names(P27)), unname)

Use ggstatplot to test if there is any significant differences Chr1

library(ggstatsplot)

# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Country = fct_reorder(Country, Chr1, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Country,
  y = Chr1,
  title = "Chr1",
  xlab = "Country",
  ylab = "LD Half Distance (kb)",
  #centrality.plotting = FALSE,
  #centrality.point.args = list(size = 3, color = "darkred"),
  #centrality.label.args = NULL,
  #package = "RColorBrewer",
  #palette = "Dark2",
  ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_chr1_test.pdf"), width = 10, height = 8)

Chr2

# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Country = fct_reorder(Country, Chr2, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Country,
  y = Chr2,
  title = "Chr2",
  xlab = "Country",
  ylab = "LD Half Distance (kb)",
  ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr2_test.pdf"), width = 10, height =8)

Chr3

# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Country = fct_reorder(Country, Chr3, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Country,
  y = Chr3,
  title = "Chr3",
  xlab = "Country",
  ylab = "LD Half Distance (kb)",
  ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr3_test.pdf"), width = 10, height = 8)

Re-do with same # of individuals in each population.

1. Prepare the data

We can start with file7 (copied to the new linkage directory), because in the QC Markdown we created this file using the chromosomal scale, and it already has been filtered for relatedness, heterozygosity, missingness, etc.

List pops

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/file7.fam | sort | uniq -c | awk '{print $2, $1}'
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BEN 12
## BER 12
## BRE 13
## BUL 10
## CAM 12
## CES 14
## CHA 12
## CRO 12
## DES 16
## FRS 12
## GEL 2
## GES 12
## GRA 11
## GRC 10
## GRV 12
## HAI 12
## HAN 4
## HOC 7
## HUN 12
## IMP 4
## INJ 11
## INW 4
## ITB 5
## ITP 9
## ITR 12
## JAF 2
## KAC 6
## KAG 12
## KAN 11
## KAT 6
## KER 12
## KLP 4
## KRA 12
## KUN 4
## LAM 9
## MAL 12
## MAT 12
## OKI 12
## PAL 11
## POL 2
## POP 12
## QNC 11
## RAR 12
## REC 11
## ROM 4
## ROS 11
## SER 4
## SEV 12
## SIC 9
## SLO 12
## SOC 12
## SON 3
## SPB 8
## SPC 6
## SPM 5
## SPS 8
## SSK 12
## STS 12
## SUF 6
## SUU 6
## TAI 7
## TIK 12
## TIR 4
## TRE 12
## TUA 9
## TUH 12
## UTS 12
## YUN 9

2. We need to remove pops with less than 10 individuals:

List pops with 10 or more

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
awk '{print $1}' output/snps_sets/linkage/file7.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 10 {print}'
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BEN 12
## BER 12
## BRE 13
## BUL 10
## CAM 12
## CES 14
## CHA 12
## CRO 12
## DES 16
## FRS 12
## GES 12
## GRA 11
## GRC 10
## GRV 12
## HAI 12
## HUN 12
## INJ 11
## ITR 12
## KAG 12
## KAN 11
## KER 12
## KRA 12
## MAL 12
## MAT 12
## OKI 12
## PAL 11
## POP 12
## QNC 11
## RAR 12
## REC 11
## ROS 11
## SEV 12
## SLO 12
## SOC 12
## SSK 12
## STS 12
## TIK 12
## TRE 12
## TUH 12
## UTS 12

We can use the command above but only print the list of families, and create a new file (pops_4ld_2.txt) for the LD analysis.

Create list of populations

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
awk '{print $1}' output/snps_sets/linkage/file7.fam  | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 10 {print}' | awk '{print $1}' > output/snps_sets/linkage/pops_4ld_2.txt;
head -n100 output/snps_sets/linkage/pops_4ld_2.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BEN
## BER
## BRE
## BUL
## CAM
## CES
## CHA
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## GRV
## HAI
## HUN
## INJ
## ITR
## KAG
## KAN
## KER
## KRA
## MAL
## MAT
## OKI
## PAL
## POP
## QNC
## RAR
## REC
## ROS
## SEV
## SLO
## SOC
## SSK
## STS
## TIK
## TRE
## TUH
## UTS

We also want to remove American pops (just want native and Europe pops) Create a list of individuals we removed

echo 'BER
GRV
PAL
REC' > /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/populations_to_remove.txt
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage
input_file="pops_4ld_2.txt"
pops_to_remove_file="populations_to_remove.txt"
temp_file=$(mktemp)
grep -vFf "$pops_to_remove_file" "$input_file" > "$temp_file"
mv "$temp_file" "$input_file"

Check

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
head -n100 output/snps_sets/linkage/pops_4ld_2.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BEN
## BER
## BRE
## BUL
## CAM
## CES
## CHA
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## GRV
## HAI
## HUN
## INJ
## ITR
## KAG
## KAN
## KER
## KRA
## MAL
## MAT
## OKI
## PAL
## POP
## QNC
## RAR
## REC
## ROS
## SEV
## SLO
## SOC
## SSK
## STS
## TIK
## TRE
## TUH
## UTS

Now remove random individuals for pops with >10 individuals, so that all pops have only 10. We also want to save a list of the individuals that we are removing.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage
input_file="file7.fam"
num_individuals=10
temp_file=$(mktemp)
removed_ind_file="removed_ind.txt" ## Specify the file to save removed individuals
shuf "$input_file" > "$temp_file"
#awk -v num="$num_individuals" '{a[$1]++} a[$1]<=num' "$temp_file" > "10ind_per_pop.txt"
awk -v num="$num_individuals" '{
    if (!(a[$1]++ >= num)) {
        print $0
    } else {
        print $0 >> "'"$removed_ind_file"'"
    }
}' "$temp_file" > "10ind_per_pop.txt"

rm "$temp_file"

Check output file

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
awk '{print $1}' output/snps_sets/linkage/10ind_per_pop.txt | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 10 {print}'
## ALD 10
## ALU 10
## ALV 10
## ARM 10
## BAR 10
## BEN 10
## BER 10
## BRE 10
## BUL 10
## CAM 10
## CES 10
## CHA 10
## CRO 10
## DES 10
## FRS 10
## GES 10
## GRA 10
## GRC 10
## GRV 10
## HAI 10
## HUN 10
## INJ 10
## ITR 10
## KAG 10
## KAN 10
## KER 10
## KRA 10
## MAL 10
## MAT 10
## OKI 10
## PAL 10
## POP 10
## QNC 10
## RAR 10
## REC 10
## ROS 10
## SEV 10
## SLO 10
## SOC 10
## SSK 10
## STS 10
## TIK 10
## TRE 10
## TUH 10
## UTS 10

To actually remove the individuals not in the “10ind_per_pop.txt” file from the bed file, use the “removed_ind_file” file with the list of individuals that we removed

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
plink2 \
--allow-extra-chr \
--remove output/snps_sets/linkage/removed_ind.txt \
--bfile output/snps_sets/linkage/file7 \
--make-bed \
--out output/snps_sets/linkage/ld2 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/ld2.log

688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from 87183 variants loaded from output/snps_sets/linkage/file7.bim. –remove: 606 samples remaining. 606 samples (72 females, 58 males, 476 ambiguous; 606 founders) remaining after

3. Chromosomoal level LD estimates

We will use MAF 0.005 for each chromosome (default from PopLDdecay) Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 3733605 199.4    7322801 391.1  7322801 391.1
## Vcells 7645024  58.4   30092465 229.6 47019323 358.8

We can estimate LD for each chromosome separated.

Import the .bim file with the SNPs to create a new chromosomal scale. Now use ld1 (rather than file7) because it has pruned the individuals so no pop has >10 individuals

snps <-
  read_delim(                    # to learn about the options use here, run ?read_delim on the console.
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/ld2.bim"
    ),                           # use library here to load it
    col_names      = FALSE,      # we don't have header in the input file
    show_col_types = FALSE,      # suppress message from read_delim
    col_types      = "ccidcc"    # set the class of each column
  )
#
# set column names
colnames(
  snps
) <-                             # to add a header in our tibble
  c(
    "Chromosome", "SNP", "Cm", "Position", "Allele1", "Allele2"
  )
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
##   Chromosome SNP             Cm Position Allele1 Allele2
##   <chr>      <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1          AX-583035163     0   315386 A       G      
## 2 1          AX-583033356     0   315674 C       T      
## 3 1          AX-583033370     0   330057 G       A      
## 4 1          AX-583035194     0   330265 A       G      
## 5 1          AX-583033387     0   331288 C       T      
## 6 1          AX-583035257     0   442875 T       C

Separate the tibbles into each chromosome.

# chr1
chr1_snps <-
  snps |>
  filter(Chromosome == 1) |> # here we get only Chromosome rows starting with 1
  as_tibble() # save as tibble
#
# chr2
chr2_snps <-
  snps |>
  filter(Chromosome == 2) |>
  as_tibble()
#
# chr3
chr3_snps <-
  snps |>
  filter(Chromosome == 3) |>
  as_tibble()

We have objects with the SNPs for each chromosome

head(chr1_snps)
## # A tibble: 6 × 6
##   Chromosome SNP             Cm Position Allele1 Allele2
##   <chr>      <chr>        <int>    <dbl> <chr>   <chr>  
## 1 1          AX-583035163     0   315386 A       G      
## 2 1          AX-583033356     0   315674 C       T      
## 3 1          AX-583033370     0   330057 G       A      
## 4 1          AX-583035194     0   330265 A       G      
## 5 1          AX-583033387     0   331288 C       T      
## 6 1          AX-583035257     0   442875 T       C

Filter the data by chromosome Chromosome 1

write.table(
  chr1_snps$SNP,
  file      = here(
    "euro_global/output/snps_sets/linkage/chr1_ld2_SNPs.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Chromosome 2

write.table(
  chr2_snps$SNP,
  file      = here(
    "euro_global/output/snps_sets/linkage/chr2_ld2_SNPs.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Chromosome 3

write.table(
  chr3_snps$SNP,
  file      = here(
    "euro_global/output/snps_sets/linkage/chr3_ld2_SNPs.txt"
  ),
  sep       = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote     = FALSE
)

Now create new files for each chromosome

cd /gpfs/gibbs/pi/caccone/mkc54/albo
mkdir -p euro_global/output/snps_sets/linkage/chr1_2;
mkdir -p euro_global/output/snps_sets/linkage/chr2_2;
mkdir -p euro_global/output/snps_sets/linkage/chr3_2;

Chromosome 1

cd /gpfs/gibbs/pi/caccone/mkc54/albo

plink2 \
--allow-extra-chr \
--bfile euro_global/output/snps_sets/linkage/ld2 \
--keep-fam euro_global/output/snps_sets/linkage/pops_4ld_2.txt \
--extract euro_global/output/snps_sets/linkage/chr1_ld2_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out euro_global/output/snps_sets/linkage/chr1_2/ld2 \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/snps_sets/linkage/chr1_2/ld2.log

606 samples (72 females, 58 males, 476 ambiguous; 606 founders) loaded from 87183 variants loaded from euro_global/output/snps_sets/linkage/ld1.bim. –extract: 19362 variants remaining. –keep-fam: 410 samples remaining. 410 samples (35 females, 41 males, 334 ambiguous; 410 founders) remaining after 0 variants removed due to allele frequency threshold(s) 19362 variants remaining after main filters.

Chromosome 2

plink2 \
--allow-extra-chr \
--bfile euro_global/output/snps_sets/linkage/ld2 \
--keep-fam euro_global/output/snps_sets/linkage/pops_4ld_2.txt \
--extract euro_global/output/snps_sets/linkage/chr2_ld2_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out euro_global/output/snps_sets/linkage/chr2_2/ld2 \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/snps_sets/linkage/chr2_2/ld2.log

606 samples (72 females, 58 males, 476 ambiguous; 606 founders) loaded from 87183 variants loaded from euro_global/output/snps_sets/linkage/ld1.bim. –extract: 36611 variants remaining. –keep-fam: 410 samples remaining. 410 samples (35 females, 41 males, 334 ambiguous; 410 founders) remaining after 0 variants removed due to allele frequency threshold(s) 36611 variants remaining after main filters.

Chromosome 3

plink2 \
--allow-extra-chr \
--bfile euro_global/output/snps_sets/linkage/ld2 \
--keep-fam euro_global/output/snps_sets/linkage/pops_4ld_2.txt \
--extract euro_global/output/snps_sets/linkage/chr3_ld2_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out euro_global/output/snps_sets/linkage/chr3_2/ld2 \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/snps_sets/linkage/chr3_2/ld2.log

606 samples (72 females, 58 males, 476 ambiguous; 606 founders) loaded from 87183 variants loaded from euro_global/output/snps_sets/linkage/ld1.bim. –extract: 31210 variants remaining. –keep-fam: 410 samples remaining. 410 samples (35 females, 41 males, 334 ambiguous; 410 founders) remaining after 0 variants removed due to allele frequency threshold(s) 31210 variants remaining after main filters.

Check the vcfs

ls /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr*/ld2.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/ld2.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/ld2.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/ld2.vcf

3.1 Chromosome 1

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 3733774 199.5    7322801 391.1  7322801 391.1
## Vcells 7641871  58.4   30092465 229.6 47019323 358.8

3.1.1 run PopLDdecay

module spider bcftools
#BCFtools/1.16-GCCcore-10.2.0
module load BCFtools/1.16-GCCcore-10.2.0

Prepare the files required by PopLDdecay

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
bcftools query -l output/snps_sets/linkage/chr1_2/ld2.vcf > output/snps_sets/linkage/vcf_samples.txt

bcftools query -l output/snps_sets/linkage/chr1_2/ld2.vcf | cut -d'_' -f1 | sort | uniq > output/snps_sets/linkage/unique_pops_from_vcf.txt
while read pop; do
    grep "^${pop}_" output/snps_sets/linkage/vcf_samples.txt > output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin

#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
    PopLDdecay -InVCF output/snps_sets/linkage/chr1_2/ld2.vcf \
               -OutType 4 \
               -MaxDist 1000 \
               -MAF 0.01 \
               -OutFilterSNP \
               -OutStat output/snps_sets/linkage/chr1_2/${pop}_LDdecay.gz \
               -SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt

3.1.2. Plot using PopLDdecay

Get the files we need

for file in output/snps_sets/linkage/chr1_2/*_LDdecay.stat.gz; do 
    pop_name=$(basename $file _LDdecay.stat.gz)
    echo "$file    $pop_name"
done > output/snps_sets/linkage/chr1_2/ld_decay_results_list.txt

head -100 output/snps_sets/linkage/chr1_2/ld_decay_results_list.txt

Exit the conda environment and load R

module load R/4.3.0-foss-2020b

Plot using PopLDdecay. It will create files that we can use to plot with ggplot

module load Perl/5.36.1-GCCcore-12.2.0
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl --help
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl  \
      -in output/snps_sets/linkage/chr1_2/ld_decay_results_list.txt \
      -output output/snps_sets/linkage/chr1_2/all_pops \
      -bin1 10 \
      -bin2 100 \
      -break 100 \
      -maxX 1000 \
      -measure r2 \
      -method MeanBin \
      -keepR

3.1.3. Plot using ggplot

library(ggrepel)
library(RColorBrewer)

fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/ld2.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Extract the family names from the first column
populations <- unique(fam_data$V1)

# Determine the number of unique populations
num_populations <- length(unique(populations))

# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")

# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)

# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Initialize an empty list to store data frames
data_frames <- list()

# Read data from each population file and store it in the list
for (pop in populations) {
  file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/all_pops.", pop, sep = "")
  data <- read.table(file_path)
  data$population <- pop  # Add a column for population name
  data_frames[[pop]] <- data
}

# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Calculate half distances
half_distances <- combined_data |>
  group_by(population) |>
  mutate(max_r2 = max(V2)) |>
  filter(V2 <= max_r2 / 2) |>
  arrange(V1) |>
  slice(1) |>
  select(population, half_distance = V1)

# Calculate the maximum V2 value for each population
max_values <- combined_data |>
  group_by(population) |>
  summarize(max_V2 = max(V2))

# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")

library(scales)

# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
  geom_point(size = .3, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
  geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
  geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")), 
            color = "blue", vjust = -1, size = 3) +
  labs(
    title = "",
    x = "Distance(Kb)",
    y = expression(r^2)
  ) +
  scale_color_manual(values = color_mapping) +
  scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
  scale_x_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  ) +
  facet_wrap(~ population, scales = "free_y", ncol = 2) +
  coord_cartesian(xlim = c(0, 1000))
ggsave(
  filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr1_2.pdf"), 
  width = 14,
  height = 16,
  units = "in"
)

Save the data to use later

saveRDS(half_distances, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/chr1.rds"
))

Load the data for chrom 1

chr1 <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/chr1.rds"
))

3.2. Chromosome 2

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4139154 221.1    7322801 391.1  7322801 391.1
## Vcells 11408873  87.1   38032589 290.2 47019323 358.8
module load BCFtools/1.16-GCCcore-10.2.0
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin

#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0

3.2.1. run PopLDdecay

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
    PopLDdecay -InVCF output/snps_sets/linkage/chr2_2/ld2.vcf \
               -OutType 4 \
               -MaxDist 1000 \
               -MAF 0.01 \
               -OutFilterSNP \
               -OutStat output/snps_sets/linkage/chr2_2/${pop}_LDdecay.gz \
               -SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt

3.2.2. Plot using PopLDdecay

Get the files we need

for file in output/snps_sets/linkage/chr2_2/*_LDdecay.stat.gz; do 
    pop_name=$(basename $file _LDdecay.stat.gz)
    echo "$file    $pop_name"
done > output/snps_sets/linkage/chr2_2/ld_decay_results_list.txt

head -100 output/snps_sets/linkage/chr2_2/ld_decay_results_list.txt

Exit the conda environment and load R

module load R/4.3.0-foss-2020b

Plot using PopLDdecay. It will create files that we can use to plot with ggplot

module load Perl/5.36.1-GCCcore-12.2.0
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl --help

Plot using PopLDdecay. It will create files that we can use to plot with ggplot

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global

perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl  \
      -in output/snps_sets/linkage/chr2_2/ld_decay_results_list.txt \
      -output output/snps_sets/linkage/chr2_2/all_pops \
      -bin1 10 \
      -bin2 100 \
      -break 100 \
      -maxX 1000 \
      -measure r2 \
      -method MeanBin \
      -keepR

3.2.3. Plot using ggplot

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/ld2.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Extract the family names from the first column
populations <- unique(fam_data$V1)

# Determine the number of unique populations
num_populations <- length(unique(populations))

# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")

# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)

# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Initialize an empty list to store data frames
data_frames <- list()

# Read data from each population file and store it in the list
for (pop in populations) {
  file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/all_pops.", pop, sep = "")
  data <- read.table(file_path)
  data$population <- pop  # Add a column for population name
  data_frames[[pop]] <- data
}

# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Calculate half distances
half_distances <- combined_data |>
  group_by(population) |>
  mutate(max_r2 = max(V2)) |>
  filter(V2 <= max_r2 / 2) |>
  arrange(V1) |>
  slice(1) |>
  select(population, half_distance = V1)

# Calculate the maximum V2 value for each population
max_values <- combined_data |>
  group_by(population) |>
  summarize(max_V2 = max(V2))

# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")

# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
  geom_point(size = .3, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
  geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
  geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")), 
            color = "blue", vjust = -1, size = 3) +
  labs(
    title = "",
    x = "Distance(Kb)",
    y = expression(r^2)
  ) +
  scale_color_manual(values = color_mapping) +
  scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
  scale_x_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  ) +
  facet_wrap(~ population, scales = "free_y", ncol = 2) +
  coord_cartesian(xlim = c(0, 1000))

#p
ggsave(
  filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr2_2.pdf"), 
  width = 14,
  height = 16,
  units = "in"
)

Note: This produces a very big image, so you need to expand the viewer as large as possible to avoid getting an error

Save the data to use later

saveRDS(half_distances, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/chr2.rds"
))

Load the data for chrom 2

chr2 <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/chr2.rds"
))

3.3. Chromosome 3

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4139267 221.1    7322801 391.1  7322801 391.1
## Vcells 11410075  87.1   38032589 290.2 47019323 358.8
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin

#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0

3.3.1. run PopLDdecay

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
    PopLDdecay -InVCF output/snps_sets/linkage/chr3_2/ld2.vcf \
               -OutType 4 \
               -MaxDist 1000 \
               -MAF 0.01 \
               -OutFilterSNP \
               -OutStat output/snps_sets/linkage/chr3_2/${pop}_LDdecay.gz \
               -SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt

3.3.2 Plot using PopLDdecay

Get the files we need

for file in output/snps_sets/linkage/chr3_2/*_LDdecay.stat.gz; do 
    pop_name=$(basename $file _LDdecay.stat.gz)
    echo "$file    $pop_name"
done > output/snps_sets/linkage/chr3_2/ld_decay_results_list.txt

head -100 output/snps_sets/linkage/chr3_2/ld_decay_results_list.txt

Exit the conda environment and load R

module load R/4.3.0-foss-2020b

Plot using PopLDdecay. It will create files that we can use to plot with ggplot

module load Perl/5.36.1-GCCcore-12.2.0
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl --help

Plot using PopLDdecay. It will create files that we can use to plot with ggplot

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global

perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl \
      -in output/snps_sets/linkage/chr3_2/ld_decay_results_list.txt \
      -output output/snps_sets/linkage/chr3_2/all_pops \
      -bin1 10 \
      -bin2 100 \
      -break 100 \
      -maxX 1000 \
      -measure r2 \
      -method MeanBin \
      -keepR

3.3.3. Plot using ggplot

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/ld2.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Extract the family names from the first column
populations <- unique(fam_data$V1)

# Determine the number of unique populations
num_populations <- length(unique(populations))

# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")

# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)

# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Initialize an empty list to store data frames
data_frames <- list()

# Read data from each population file and store it in the list
for (pop in populations) {
  file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/all_pops.", pop, sep = "")
  data <- read.table(file_path)
  data$population <- pop  # Add a column for population name
  data_frames[[pop]] <- data
}

# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)

# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)

# Calculate half distances
half_distances <- combined_data |>
  group_by(population) |>
  mutate(max_r2 = max(V2)) |>
  filter(V2 <= max_r2 / 2) |>
  arrange(V1) |>
  slice(1) |>
  select(population, half_distance = V1)

# Calculate the maximum V2 value for each population
max_values <- combined_data |>
  group_by(population) |>
  summarize(max_V2 = max(V2))

# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")

# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
  geom_point(size = .3, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
  geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
  geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")), 
            color = "blue", vjust = -1, size = 3) +
  labs(
    title = "",
    x = "Distance(Kb)",
    y = expression(r^2)
  ) +
  scale_color_manual(values = color_mapping) +
  scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
  scale_x_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold")
  ) +
  facet_wrap(~ population, scales = "free_y", ncol = 2) +
  coord_cartesian(xlim = c(0, 1000))

#p
ggsave(
  filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr3_2.pdf"), 
  width = 14,
  height = 16,
  units = "in"
)
ggsave(
  filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr3_2.png"), 
  width = 14,
  height = 16,
  units = "in",
  bg = "white"
)

Save the data to use later

saveRDS(half_distances, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/chr3.rds"
))

Load the data for chrom 3

chr3 <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/chr3.rds"
))

4. Plot comparing chromosomes

Clean env and memory

# Remove all objects from the environment
rm(list = ls())

# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  4139392 221.1    7322801 391.1  7322801 391.1
## Vcells 11411216  87.1   38032589 290.2 47019323 358.8

Create data frame with the data from each chromosome

Import the data

# Chromosome 1
chr1 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/chr1.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
  chr1 = half_distance
) |>
  mutate(chr1 = chr1/1000)

# Chromosome 2
chr2 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/chr2.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
  chr2 = half_distance
) |>
  mutate(chr2 = chr2/1000)

# Chromosome 3
chr3 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/chr3.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
  chr3 = half_distance
) |>
  mutate(chr3 = chr3/1000)

# Merge
distance <- merge(merge(chr1, chr2, by="population"), chr3, by="population")

# Convert data from wide to long format
data_long <- gather(distance, key = "chr", value = "value", -population)

# Make it capital
data_long$chr <- tools::toTitleCase(data_long$chr)

# Remove KAT because it has mosquitoes collected from different source and had higher linkage due it
data_long <- subset(data_long, population != "KAT")

Make one plot

# Define a custom color palette
custom_palette <- c(
  "Chr1" = "orange", 
  "Chr2" = "green3", 
  "Chr3" = "blue"
)

# Reordering the levels of the chr factor
data_long$chr <- factor(data_long$chr, levels = c("Chr3", "Chr2", "Chr1"))

# source the plotting function
source(here("analyses", "my_theme2.R"))

# Plotting half_distance with borders and spaced bars
ggplot(data_long, aes(x = population, y = value, fill = chr)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
  labs(title = "", y = "Half Distance (kb)", x = "Population") +
  geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
  my_theme() +
  coord_flip() +
  scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
  theme(legend.position = "top",
        plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"))

Save

ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/ld/decay_comparison_2.pdf", width = 8, height = 14)

Import sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
head(sampling_loc)
##       Pop_City    Country  Latitude Longitude Continent Abbreviation
## 1  Franceville      Gabon  -1.59207  13.53242    Africa          GAB
## 2 Antananarivo Madagascar -18.87920  47.50790    Africa          ANT
## 3  Diego ville Madagascar -12.27361  49.29372    Africa          DGV
## 4    Morondava Madagascar -20.28420  44.27940    Africa          MAD
## 5     Vohimasy Madagascar -22.81591  47.75026    Africa          VOH
## 6      Dauguet  Mauritius -20.18530  57.52154    Africa          DAU
##          Year         Region    Subregion order
## 1        2015 Central Africa       Africa    72
## 2        2022    East Africa  East Africa    76
## 3        2022    East Africa  East Africa    77
## 4        2016    East Africa  East Africa    78
## 5 2016 & 2017    East Africa  East Africa    79
## 6        2022   Indian Ocean Indian Ocean    80
head(data_long)
##   population  chr value
## 1        ALD Chr1  40.7
## 2        ALU Chr1   5.6
## 3        ALV Chr1  19.5
## 4        ARM Chr1  14.2
## 5        BAR Chr1  20.3
## 6        BEN Chr1  18.9
# Join with sampling_loc to get sampling localities
distance2 <- data_long |>
  left_join(sampling_loc, by = c("population" = "Abbreviation"))
head(distance2)
##   population  chr value  Pop_City Country Latitude Longitude Continent    Year
## 1        ALD Chr1  40.7    Durres Albania 41.29704  19.50373    Europe    2018
## 2        ALU Chr1   5.6   Alushta Ukraine 44.68289  34.40368    Europe    2021
## 3        ALV Chr1  19.5     Vlore Albania 40.46600  19.48970    Europe    2020
## 4        ARM Chr1  14.2    Ijevan Armenia 40.87971  45.14764    Europe    2020
## 5        BAR Chr1  20.3 Barcelona   Spain 41.38510   2.17340    Europe    2018
## 6        BEN Chr1  18.9 Bengaluru   India 12.97160  77.59460      Asia Unknown
##            Region   Subregion order
## 1 Southern Europe East Europe    25
## 2  Eastern Europe East Europe    35
## 3 Southern Europe East Europe    24
## 4  Eastern Europe East Europe    42
## 5 Southern Europe West Europe     8
## 6      South Asia                52

Add the name of the city/countries to the plot

# Creating the label with population, city, and country for the y-axis
distance2$pop_city_country_label <- paste(distance2$population, "\n", distance2$Pop_City, "\n(", distance2$Country, ")", sep = "")

# Sorting by Country, then City, and then by Population to ensure populations from the same country (and city) are plotted together
distance2 <- distance2 %>% arrange(Country, Pop_City, population)

# Adjusting the factor levels for plotting in the desired order
distance2$pop_city_country_label <- fct_inorder(distance2$pop_city_country_label)


# Plotting the data
ggplot(distance2, aes(x = pop_city_country_label, y = value, fill = chr)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
  labs(title = "", y = "Half Distance (kb)", x = "") +
  geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
  my_theme() +
  coord_flip() +
  labs(title = "", y = "Half Distance (kb)", x = "Population") +
  scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
  theme(legend.position = "top",
        plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"),
        axis.text.y = element_text(size = 5, angle = 0, hjust = 1))

Save

ggsave("scripts/RMarkdowns/output/euro_global/ld/decay_comp_poplabels_2.pdf", width = 8, height = 14)
# Create table
dist_table <- distance2 |>
  dplyr::select(
    population, Pop_City, Country, chr, value
  ) |>
  dplyr::rename(
    City = Pop_City,
    Population = population
  )

# Spread and arrange
dist_table_condensed <- dist_table %>%
  spread(key = chr, value = value) %>%
  dplyr::select(Population, Country, City, Chr1, Chr2, Chr3) |>
  dplyr::arrange(Country, City)


# Create table
ft <- flextable::flextable(dist_table_condensed)


# Apply zebra striping with bg
ft <- flextable::theme_zebra(ft)

# Show it
ft

Population

Country

City

Chr1

Chr2

Chr3

ALD

Albania

Durres

40.7

72.3

47.2

ALV

Albania

Vlore

19.5

2.5

81.2

ARM

Armenia

Ijevan

14.2

53.4

37.8

BUL

Bulgaria

Lom

57.9

95.5

57.0

CAM

Cambodia

Phnom Penh

9.9

216.9

4.6

HAI

China

Hainan

29.5

71.5

35.6

HUN

China

Hunan

21.8

3.6

60.6

CRO

Croatia

Dubrovnik

49.9

1.6

85.0

FRS

France

Saint-Martin-d'Heres

3.1

25.5

70.5

STS

France

Strasbourg

26.6

74.2

88.6

GES

Georgia

Sakhumi, Abkhazia

5.6

109.6

79.1

GRA

Greece

Athens

19.5

3.3

0.5

GRC

Greece

Chania

62.0

141.4

87.9

BEN

India

Bengaluru

18.9

112.6

53.2

INJ

Indonesia

Jakarta

8.0

107.0

65.4

BRE

Italy

Brescia

21.3

57.0

72.0

CES

Italy

Cesena

20.3

3.5

44.2

DES

Italy

Desenzano

5.6

64.5

108.7

ITR

Italy

Rome (Trappola)

16.1

57.4

36.8

TRE

Italy

Trentino

12.2

42.6

56.4

KAG

Japan

Kagoshima

28.0

30.3

35.6

KAN

Japan

Kanazawa

23.0

41.0

39.1

OKI

Japan

Okinawa

9.1

20.4

161.5

UTS

Japan

Utsunomiya

35.4

13.2

32.2

MAT

Malaysia

Tambun

11.4

246.1

104.3

MAL

Malta

Luqa

25.3

53.4

81.4

POP

Portugal

Penafiel

48.7

39.1

57.0

ROS

Romania

Satu Mare

13.6

69.0

51.4

RAR

Russia

Armavir

24.8

110.3

88.5

KRA

Russia

Krasnodar

37.0

1.8

99.6

SOC

Russia

Sochi

34.1

129.5

82.6

TIK

Russia

Tikhoretsk

5.6

165.6

155.7

SLO

Slovenia

Ajdovscina

20.3

53.4

77.7

BAR

Spain

Barcelona

20.3

1.7

75.2

CHA

Thailand

Chanthaburi

12.3

129.5

24.5

SSK

Thailand

Sisaket

12.3

39.4

40.7

TUH

Turkey

Hopa

20.3

0.3

81.2

ALU

Ukraine

Alushta

5.6

61.3

64.4

KER

Ukraine

Kerch, Crimea

5.6

110.9

92.5

SEV

Ukraine

Sevastopol, Crimea

5.1

70.8

66.0

QNC

Vietnam

Quy Nhon City

3.4

82.5

6.6

library(flextable)
library(officer)
# Create a new Word document
doc <- read_docx()

# Add the flextable
doc <- body_add_flextable(doc, value = ft)

# Save the document to a file
# Define the path for saving the Word document
file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_albopictus_2.docx")

print(doc, target = file_path)

4.1. Get Counts

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/ld2.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Get the number of individuals
individuals_count <- table(fam_data$V1)
individuals_count
## 
## ALD ALU ALV ARM BAR BEN BRE BUL CAM CES CHA CRO DES FRS GES GRA GRC HAI HUN INJ 
##  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10 
## ITR KAG KAN KER KRA MAL MAT OKI POP QNC RAR ROS SEV SLO SOC SSK STS TIK TRE TUH 
##  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10 
## UTS 
##  10

Merge the data

# Convert individuals_count to a data frame
individuals_df <- as.data.frame(individuals_count)
colnames(individuals_df) <- c("Population", "Count")

# Merge with dist_table_condensed
merged_data <- merge(dist_table_condensed, individuals_df, by="Population")

Check to make sure all pops have 10 for each chromosome

# Chr1 vs Count
plot_Chr1 <- ggplot(merged_data, aes(x = Count, y = Chr1)) +
  geom_point() +
  geom_smooth(method = "lm", col = "orange") +
  ggtitle("Correlation between Count and Chr1") +
  labs(x = "Count", y = "LD Half Distance - Chr1")

# Chr2 vs Count
plot_Chr2 <- ggplot(merged_data, aes(x = Count, y = Chr2)) +
  geom_point() +
  geom_smooth(method = "lm", col = "green3") +
  ggtitle("Correlation between Count and Chr2") +
  labs(x = "Count", y = "LD Half Distance - Chr2")

# Chr3 vs Count
plot_Chr3 <- ggplot(merged_data, aes(x = Count, y = Chr3)) +
  geom_point() +
  geom_smooth(method = "lm", col = "blue") +
  ggtitle("Correlation between Count and Chr3") +
  labs(x = "Count", y = "LD Half Distance - Chr3")

# Display the plots
print(plot_Chr1)
## `geom_smooth()` using formula = 'y ~ x'

There is no variation in the number of individuals, since we filtered out all pops with <10 and removed individuals for pops with >10.

# Read the data
library(data.table)
merged_data_long <- melt(setDT(merged_data), id.vars = c("Population","Country", "City", "Count"), variable.name = "Chromosome")
saveRDS(merged_data_long, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/merged_data_long_2.rds"
))

4.2 Plot all countries

# Calculate mean for each Country and Chromosome
mean_data <- merged_data_long %>%
  group_by(Country, Chromosome) %>%
  summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")

# Jitter plot overlaid on boxplot for each chromosome by country without legend
ggplot(merged_data_long, aes(x = Country, y = value)) +
  geom_boxplot(alpha = 0.5) +
  geom_jitter(aes(color = Country), width = 0.2) +
  geom_text(data = mean_data, aes(y = mean_value, label = round(mean_value, 2)), vjust = 0.5, hjust = -0.5, size = 2) + # Annotate mean value
  facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
  theme_minimal() +
  labs(title = "", y = "LD Half Distance (kb)") +
  my_theme() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "none") +
  theme(axis.text.y = element_text(size = 5, angle = 0, hjust = 1))

# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_mean_by_country.pdf"), width = 6, height = 10)

4.3. Differences

x <- unique(merged_data$Country)
x
##  [1] "Albania"   "Ukraine"   "Armenia"   "Spain"     "India"     "Italy"    
##  [7] "Bulgaria"  "Cambodia"  "Thailand"  "Croatia"   "France"    "Georgia"  
## [13] "Greece"    "China"     "Indonesia" "Japan"     "Russia"    "Malta"    
## [19] "Malaysia"  "Portugal"  "Vietnam"   "Romania"   "Slovenia"  "Turkey"
library(Polychrome)
P27 = createPalette(27,  c("#ff0000", "#00ff00", "#0000ff"))
swatch(P27)

color_palette_27 <- lapply(split(P27, names(P27)), unname)

Use ggstatplot to test if there is any significant differences Chr1

library(ggstatsplot)
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Country = fct_reorder(Country, Chr1, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Country,
  y = Chr1,
  title = "Chr1",
  xlab = "Country",
  ylab = "LD Half Distance (kb)",
  #centrality.plotting = FALSE,
  #centrality.point.args = list(size = 3, color = "darkred"),
  #centrality.label.args = NULL,
  #package = "RColorBrewer",
  #palette = "Dark2",
  ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_chr1_test.pdf"), width = 10, height = 8)

Chr2

# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Country = fct_reorder(Country, Chr2, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Country,
  y = Chr2,
  title = "Chr2",
  xlab = "Country",
  ylab = "LD Half Distance (kb)",
  ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr2_test.pdf"), width = 10, height =8)

Chr3

# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Country = fct_reorder(Country, Chr3, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Country,
  y = Chr3,
  title = "Chr3",
  xlab = "Country",
  ylab = "LD Half Distance (kb)",
  ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr3_test.pdf"), width = 10, height = 8)

Add region to merged_data

Import sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
head(sampling_loc)
##       Pop_City    Country  Latitude Longitude Continent Abbreviation
## 1  Franceville      Gabon  -1.59207  13.53242    Africa          GAB
## 2 Antananarivo Madagascar -18.87920  47.50790    Africa          ANT
## 3  Diego ville Madagascar -12.27361  49.29372    Africa          DGV
## 4    Morondava Madagascar -20.28420  44.27940    Africa          MAD
## 5     Vohimasy Madagascar -22.81591  47.75026    Africa          VOH
## 6      Dauguet  Mauritius -20.18530  57.52154    Africa          DAU
##          Year         Region    Subregion order
## 1        2015 Central Africa       Africa    72
## 2        2022    East Africa  East Africa    76
## 3        2022    East Africa  East Africa    77
## 4        2016    East Africa  East Africa    78
## 5 2016 & 2017    East Africa  East Africa    79
## 6        2022   Indian Ocean Indian Ocean    80
head(data_long)
##   population  chr value
## 1        ALD Chr1  40.7
## 2        ALU Chr1   5.6
## 3        ALV Chr1  19.5
## 4        ARM Chr1  14.2
## 5        BAR Chr1  20.3
## 6        BEN Chr1  18.9
# Join with sampling_loc to get sampling localities
distance2 <- data_long |>
  left_join(sampling_loc, by = c("population" = "Abbreviation"))
head(distance2)
##   population  chr value  Pop_City Country Latitude Longitude Continent    Year
## 1        ALD Chr1  40.7    Durres Albania 41.29704  19.50373    Europe    2018
## 2        ALU Chr1   5.6   Alushta Ukraine 44.68289  34.40368    Europe    2021
## 3        ALV Chr1  19.5     Vlore Albania 40.46600  19.48970    Europe    2020
## 4        ARM Chr1  14.2    Ijevan Armenia 40.87971  45.14764    Europe    2020
## 5        BAR Chr1  20.3 Barcelona   Spain 41.38510   2.17340    Europe    2018
## 6        BEN Chr1  18.9 Bengaluru   India 12.97160  77.59460      Asia Unknown
##            Region   Subregion order
## 1 Southern Europe East Europe    25
## 2  Eastern Europe East Europe    35
## 3 Southern Europe East Europe    24
## 4  Eastern Europe East Europe    42
## 5 Southern Europe West Europe     8
## 6      South Asia                52
# Create table
dist_table <- distance2 |>
  dplyr::select(
    population, Pop_City, Country, Continent, chr, value
  ) |>
  dplyr::rename(
    City = Pop_City,
    Population = population
  )

# Spread and arrange
dist_table_condensed <- dist_table %>%
  spread(key = chr, value = value) %>%
  dplyr::select(Population, City, Country, Continent, Chr1, Chr2, Chr3) |>
  dplyr::arrange(Continent, Country, City)


# Create table
ft <- flextable::flextable(dist_table_condensed)


# Apply zebra striping with bg
ft <- flextable::theme_zebra(ft)

# Show it
ft

Population

City

Country

Continent

Chr1

Chr2

Chr3

CAM

Phnom Penh

Cambodia

Asia

9.9

216.9

4.6

HAI

Hainan

China

Asia

29.5

71.5

35.6

HUN

Hunan

China

Asia

21.8

3.6

60.6

BEN

Bengaluru

India

Asia

18.9

112.6

53.2

INJ

Jakarta

Indonesia

Asia

8.0

107.0

65.4

KAG

Kagoshima

Japan

Asia

28.0

30.3

35.6

KAN

Kanazawa

Japan

Asia

23.0

41.0

39.1

OKI

Okinawa

Japan

Asia

9.1

20.4

161.5

UTS

Utsunomiya

Japan

Asia

35.4

13.2

32.2

MAT

Tambun

Malaysia

Asia

11.4

246.1

104.3

CHA

Chanthaburi

Thailand

Asia

12.3

129.5

24.5

SSK

Sisaket

Thailand

Asia

12.3

39.4

40.7

QNC

Quy Nhon City

Vietnam

Asia

3.4

82.5

6.6

ALD

Durres

Albania

Europe

40.7

72.3

47.2

ALV

Vlore

Albania

Europe

19.5

2.5

81.2

ARM

Ijevan

Armenia

Europe

14.2

53.4

37.8

BUL

Lom

Bulgaria

Europe

57.9

95.5

57.0

CRO

Dubrovnik

Croatia

Europe

49.9

1.6

85.0

FRS

Saint-Martin-d'Heres

France

Europe

3.1

25.5

70.5

STS

Strasbourg

France

Europe

26.6

74.2

88.6

GES

Sakhumi, Abkhazia

Georgia

Europe

5.6

109.6

79.1

GRA

Athens

Greece

Europe

19.5

3.3

0.5

GRC

Chania

Greece

Europe

62.0

141.4

87.9

BRE

Brescia

Italy

Europe

21.3

57.0

72.0

CES

Cesena

Italy

Europe

20.3

3.5

44.2

DES

Desenzano

Italy

Europe

5.6

64.5

108.7

ITR

Rome (Trappola)

Italy

Europe

16.1

57.4

36.8

TRE

Trentino

Italy

Europe

12.2

42.6

56.4

MAL

Luqa

Malta

Europe

25.3

53.4

81.4

POP

Penafiel

Portugal

Europe

48.7

39.1

57.0

ROS

Satu Mare

Romania

Europe

13.6

69.0

51.4

RAR

Armavir

Russia

Europe

24.8

110.3

88.5

KRA

Krasnodar

Russia

Europe

37.0

1.8

99.6

SOC

Sochi

Russia

Europe

34.1

129.5

82.6

TIK

Tikhoretsk

Russia

Europe

5.6

165.6

155.7

SLO

Ajdovscina

Slovenia

Europe

20.3

53.4

77.7

BAR

Barcelona

Spain

Europe

20.3

1.7

75.2

TUH

Hopa

Turkey

Europe

20.3

0.3

81.2

ALU

Alushta

Ukraine

Europe

5.6

61.3

64.4

KER

Kerch, Crimea

Ukraine

Europe

5.6

110.9

92.5

SEV

Sevastopol, Crimea

Ukraine

Europe

5.1

70.8

66.0

library(officer)
library(flextable)
# Create a new Word document
doc <- read_docx()

# Add the flextable
doc <- body_add_flextable(doc, value = ft)

# Save the document to a file
# Define the path for saving the Word document
file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_albopictus_withCont_2.docx")

print(doc, target = file_path)

4.4. Get Counts

# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/ld2.fam")

# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)

# Get the number of individuals
individuals_count <- table(fam_data$V1)
individuals_count
## 
## ALD ALU ALV ARM BAR BEN BRE BUL CAM CES CHA CRO DES FRS GES GRA GRC HAI HUN INJ 
##  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10 
## ITR KAG KAN KER KRA MAL MAT OKI POP QNC RAR ROS SEV SLO SOC SSK STS TIK TRE TUH 
##  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10  10 
## UTS 
##  10

Merge the data

# Convert individuals_count to a data frame
individuals_df <- as.data.frame(individuals_count)
colnames(individuals_df) <- c("Population", "Count")

# Merge with dist_table_condensed
merged_data <- merge(dist_table_condensed, individuals_df, by="Population")
# Read the data
library(data.table)
merged_data_long <- melt(setDT(merged_data), id.vars = c("Population","Country", "Continent", "City", "Count"), variable.name = "Chromosome")
saveRDS(merged_data_long, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/merged_data_long_2.rds"
))

Calc mean countries

# Calculate mean for each Country and Chromosome
mean_data <- merged_data_long %>%
  group_by(Continent, Country, Chromosome) %>%
  summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")

Plot by continent

# Jitter plot overlaid on boxplot for each chromosome by country without legend
ggplot(merged_data_long, aes(x = Continent, y = value)) +
  geom_boxplot(alpha = 0.5) +
  geom_jitter(aes(color = Continent), width = 0.2) +
  geom_text(data = mean_data, aes(y = mean_value, label = round(mean_value, 2)), vjust = 0.5, hjust = -0.5, size = 2) + # Annotate mean value
  facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
  theme_minimal() +
  labs(title = "", y = "LD Half Distance (kb)") +
  my_theme() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "none") +
  theme(axis.text.y = element_text(size = 5, angle = 0, hjust = 1))

# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_mean_by_continent_2.pdf"), width = 6, height = 10)

Use ggstatplot to test if there is any significant differences Chr1

library(ggstatsplot)
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Continent = fct_reorder(Continent, Chr1, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Continent,
  y = Chr1,
  title = "Chr1",
  xlab = "Continent",
  ylab = "LD Half Distance (kb)",
  #centrality.plotting = FALSE,
  #centrality.point.args = list(size = 3, color = "darkred"),
  #centrality.label.args = NULL,
  #package = "RColorBrewer",
  #palette = "Dark2",
  ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")

# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_chr1_test_continent_2.pdf"), width = 10, height = 8)

Chr2

# Reorder the levels of Country based on the median of Chr1
#merged_data <- merged_data %>%
 # mutate(Continent = fct_reorder(Continent, Chr2, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Continent,
  y = Chr2,
  title = "Chr2",
  xlab = "Continent",
  ylab = "LD Half Distance (kb)",
  ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr2_test_continent_2.pdf"), width = 10, height =8)

Chr3

# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
  mutate(Continent = fct_reorder(Continent, Chr3, .fun = median))

# Plot
ggbetweenstats(
  data = merged_data,
  x = Continent,
  y = Chr3,
  title = "Chr3",
  xlab = "Continent",
  ylab = "LD Half Distance (kb)",
  ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")

# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr3_test_continent_2.pdf"), width = 10, height = 8)