## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.5.1 ✔ purrr 1.0.2
## ✔ tibble 3.2.1 ✔ dplyr 1.1.4
## ✔ tidyr 1.3.1 ✔ stringr 1.5.1
## ✔ readr 2.1.5 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## here() starts at /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns
Prepare the data
Mosquitoes removed due excess heterozygosity
## GRA 734
## TUA 783
## ITP 829
## ITP 832
## ROS 858
Mosquitoes removed duo to relatedness
## #FID IID
## SIC 1231
## SIC 1235
## SIC 1236
## DES 1445
## ITB 748
## SPM 767
## TUA 779
## TUA 780
## SPS 914
Create a list of individuals to remove
echo 'GRA 734
TUA 783
ITP 829
ITP 832
ROS 858
SIC 1231
SIC 1235
SIC 1236
DES 1445
ITB 748
SPM 767
TUA 779
TUA 780
SPS 914' > /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/remove.txt
Create file
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink2 \
--allow-extra-chr \
--remove output/snps_sets/linkage/remove.txt \
--bfile output/snps_sets/linkage/file3 \
--make-bed \
--out output/snps_sets/linkage/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/ld1.log
423 samples (44 females, 1 male, 378 ambiguous; 423 founders) loaded from 107776 variants loaded from output/snps_sets/linkage/file3.bim. –remove: 409 samples remaining. 409 samples (41 females, 1 male, 367 ambiguous; 409 founders) remaining after
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.fam | sort | uniq -c | awk '{print $2, $1}'
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BRE 13
## BUL 10
## CES 14
## CRO 12
## DES 16
## FRS 12
## GES 12
## GRA 11
## GRC 10
## IMP 4
## ITB 5
## ITP 8
## ITR 12
## KER 12
## KRA 12
## MAL 12
## POL 2
## POP 12
## RAR 12
## ROM 4
## ROS 11
## SCH 5
## SER 4
## SEV 12
## SIC 9
## SLO 12
## SOC 12
## SPB 8
## SPC 6
## SPM 5
## SPS 8
## STS 7
## TIK 12
## TIR 4
## TRE 12
## TUA 9
## TUH 12
Create list of populations
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 6 {print}' | awk '{print $1}' > /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/pops_4ld.txt;
head -n100 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/pops_4ld.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## ITP
## ITR
## KER
## KRA
## MAL
## POP
## RAR
## ROS
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPS
## STS
## TIK
## TRE
## TUA
## TUH
ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC ITP ITR KER KRA MAL POP RAR ROS SEV SIC SLO SOC SPB SPC SPS STS TIK TRE TUA TUH
Import the .bim file with the SNPs to create a new chromosomal scale.
# ____________________________________________________________________________
# import the bim file with the SNP data ####
snps <-
read_delim( # to learn about the options use here, run ?read_delim on the console.
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.bim"
), # use library here to load it
col_names = FALSE, # we don't have header in the input file
show_col_types = FALSE, # suppress message from read_delim
col_types = "ccidcc" # set the class of each column
)
#
# set column names
colnames(
snps
) <- # to add a header in our tibble
c(
"Scaffold", "SNP", "Cm", "Position", "Allele1", "Allele2"
)
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
## Scaffold SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583033226 0 161729 A G
## 2 1 AX-583033342 0 315059 C G
## 3 1 AX-583035163 0 315386 A G
## 4 1 AX-583033356 0 315674 C T
## 5 1 AX-583033370 0 330057 G A
## 6 1 AX-583035194 0 330265 A G
Separate the tibbles into each chromosome
# separate the SNP data per chromosome
# chr1
chr1_snps <-
snps |>
filter(
str_detect(
Scaffold, "^1."
)
) |> # here we get only Scaffold rows starting with 1
as_tibble() # save as tibble
#
# chr2
chr2_snps <-
snps |>
filter(
str_detect(
Scaffold, "^2."
)
) |>
as_tibble()
#
# chr3
chr3_snps <-
snps |>
filter(
str_detect(
Scaffold, "^3."
)
) |>
as_tibble()
Import the file with sizes of each scaffold.
# import the file with the scaffold sizes ####
sizes <-
read_delim(
here(
"/gpfs/gibbs/project/caccone/mkc54/albo/genome/scaffold_sizes.txt"
),
col_names = FALSE,
show_col_types = FALSE,
col_types = "cd"
)
#
# set column names
colnames(
sizes
) <- c(
"Scaffold", "Size"
)
# ____________________________________________________________________________
# create new column with the chromosome number ####
sizes <-
sizes |>
mutate(
Chromosome = case_when( # we use mutate to create a new column called Chromosome
startsWith(
Scaffold, "1"
) ~ "1", # use startsWith to get Scaffold rows starting with 1 and output 1 on Chromosome column
startsWith(
Scaffold, "2"
) ~ "2",
startsWith(
Scaffold, "3"
) ~ "3"
)
) |>
arrange(
Scaffold
) # to sort the order of the scaffolds, fixing the problem we have with scaffold 1.86
# check it
head(sizes)
## # A tibble: 6 × 3
## Scaffold Size Chromosome
## <chr> <dbl> <chr>
## 1 1.1 351198 1
## 2 1.10 11939576 1
## 3 1.100 3389100 1
## 4 1.101 470438 1
## 5 1.102 2525157 1
## 6 1.103 150026 1
Create new scale. Get the scaffolds for each chromosome.
# separate the scaffold sizes tibble per chromosome ####
# chr1
chr1_scaffolds <-
sizes |>
filter(
str_detect(
Scaffold, "^1" # we use library stringr to get scaffolds starting with 1 (chromosome 1)
)
) |>
as_tibble()
#
# chr2
chr2_scaffolds <-
sizes |>
filter(
str_detect(
Scaffold, "^2" # we use library stringr to get scaffolds starting with 2 (chromosome 2)
)
) |>
as_tibble()
#
# # chr3
chr3_scaffolds <-
sizes |>
filter(
str_detect(
Scaffold, "^3" # we use library stringr to get scaffolds starting with 3 (chromosome 3)
)
) |>
as_tibble()
Create a scale for each chromosome.
# create a new scale for each chromosome ####
# chr1
chr1_scaffolds$overall_size_before_bp <-
0 # we create a new column with zeros
for (i in 2:nrow(
chr1_scaffolds
)
) { # loop to start on second line
chr1_scaffolds$overall_size_before_bp[i] <- # set position on the scale
chr1_scaffolds$overall_size_before_bp[i - 1] + chr1_scaffolds$Size[i - # add the scaffold size and the location to get position on new scale
1]
}
#
# chr2
chr2_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
chr2_scaffolds
)
) {
chr2_scaffolds$overall_size_before_bp[i] <-
chr2_scaffolds$overall_size_before_bp[i - 1] + chr2_scaffolds$Size[i -
1]
}
#
# chr3
chr3_scaffolds$overall_size_before_bp <- 0
for (i in 2:nrow(
chr3_scaffolds
)
) {
chr3_scaffolds$overall_size_before_bp[i] <-
chr3_scaffolds$overall_size_before_bp[i - 1] + chr3_scaffolds$Size[i -
1]
}
Merge the data frames scaffolds and SNPs.
# merge the data sets using the tidyverse function left_join ####
# chr1
chr1_scale <-
chr1_snps |> # create data frame for each chromosome, get chr1_snps
left_join( # use lef_join function to merge it with chr1_scaffolds
chr1_scaffolds,
by = "Scaffold"
) |> # set column to use for merging (Scaffold in this case)
na.omit() |> # remove NAs, we don't have SNPs in every scaffold
mutate(
midPos_fullseq = as.numeric(
Position
) + # make new columns numeric
as.numeric(
overall_size_before_bp
)
)
#
# chr2
chr2_scale <-
chr2_snps |>
left_join(
chr2_scaffolds,
by = "Scaffold"
) |>
na.omit() |>
mutate(
midPos_fullseq = as.numeric(
Position
) +
as.numeric(
overall_size_before_bp
)
)
#
# chr3
chr3_scale <-
chr3_snps |>
left_join(
chr3_scaffolds,
by = "Scaffold"
) |>
na.omit() |>
mutate(
midPos_fullseq = as.numeric(
Position
) +
as.numeric(
overall_size_before_bp
)
)
Merge all chromosome scales.
# merge the data sets, and select only the columns we need ####
chroms <- rbind(
chr1_scale, chr2_scale, chr3_scale
) |>
dplyr::select(
Chromosome, SNP, Cm, midPos_fullseq, Allele1, Allele2
)
# check it
head(chroms)
## # A tibble: 0 × 6
## # ℹ 6 variables: Chromosome <chr>, SNP <chr>, Cm <int>, midPos_fullseq <dbl>,
## # Allele1 <chr>, Allele2 <chr>
Save the new .bim file
# save the new bim file with a new name, I added "B" ####
write.table(
chroms,
file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.bim"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Create a new bed file with Plink2 to see if it works. For example, to see if the variants are in the right order. Plink2 will give us a warning.
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink2 \
--bfile output/snps_sets/linkage/ld1 \
--make-bed \
--out output/snps_sets/linkage/test01;
# then we remove the files
rm output/snps_sets/linkage/test01.*
Start time: Wed Nov 1 16:25:59 2023 257256 MiB RAM detected; reserving 128628 MiB for main workspace. Using up to 32 threads (change this with –threads). 409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from output/snps_sets/linkage/ld1.fam. 107776 variants loaded from output/snps_sets/linkage/ld1.bim. Note: No phenotype data present. Writing output/snps_sets/linkage/test01.fam … done. Writing output/snps_sets/linkage/test01.bim … done. Writing output/snps_sets/linkage/test01.bed … done. End time: Wed Nov 1 16:25:59 2023
If no warnings from Plink2. Now, we can go ahead with our analysis.
We will use MAF 0.005 for each chromosome (default from PopLDdecay) Clean env and memory
We can estimate LD for each chromosome separated.
Import the .bim file with the SNPs to create a new chromosomal scale.
# import the bim file with the SNP data ####
snps <-
read_delim( # to learn about the options use here, run ?read_delim on the console.
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/ld1.bim"
), # use library here to load it
col_names = FALSE, # we don't have header in the input file
show_col_types = FALSE, # suppress message from read_delim
col_types = "ccidcc" # set the class of each column
)
#
# set column names
colnames(
snps
) <- # to add a header in our tibble
c(
"Chromosome", "SNP", "Cm", "Position", "Allele1", "Allele2"
)
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
## Chromosome SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583033226 0 161729 A G
## 2 1 AX-583033342 0 315059 C G
## 3 1 AX-583035163 0 315386 A G
## 4 1 AX-583033356 0 315674 C T
## 5 1 AX-583033370 0 330057 G A
## 6 1 AX-583035194 0 330265 A G
Separate the tibbles into each chromosome.
# chr1
chr1_snps <-
snps |>
filter(Chromosome == 1) |> # here we get only Chromosome rows starting with 1
as_tibble() # save as tibble
#
# chr2
chr2_snps <-
snps |>
filter(Chromosome == 2) |>
as_tibble()
#
# chr3
chr3_snps <-
snps |>
filter(Chromosome == 3) |>
as_tibble()
We have objects with the SNPs for each chromosome
## # A tibble: 6 × 6
## Chromosome SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583033226 0 161729 A G
## 2 1 AX-583033342 0 315059 C G
## 3 1 AX-583035163 0 315386 A G
## 4 1 AX-583033356 0 315674 C T
## 5 1 AX-583033370 0 330057 G A
## 6 1 AX-583035194 0 330265 A G
Filter the data by chromosome
#Chromosome 1
write.table(
chr1_snps$SNP,
file = here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1_ld_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
#Chromosome 2
write.table(
chr2_snps$SNP,
file = here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2_ld_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
#Chromosome 3
write.table(
chr3_snps$SNP,
file = here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3_ld_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Now create new files for each chromosome
Create dirs
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
# make directory
mkdir -p output/snps_sets/linkage/chr1;
mkdir -p output/snps_sets/linkage/chr2;
mkdir -p output/snps_sets/linkage/chr3;
Chromosome 1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/ld1 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr1_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr1/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr1/ld1.log
409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 107776 variants loaded from output/snps_sets/linkage/ld1.bim. –extract: 23976 variants remaining. –keep-fam: 376 samples remaining. 376 samples (36 females, 1 male, 339 ambiguous; 376 founders) remaining after 1614 variants removed due to allele frequency threshold(s) 22362 variants remaining after main filters.
Chromosome 2
plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/ld1 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr2_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr2/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr2/ld1.log
409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 107776 variants loaded from output/snps_sets/linkage/ld1.bim. –extract: 45220 variants remaining. –keep-fam: 376 samples remaining. 376 samples (36 females, 1 male, 339 ambiguous; 376 founders) remaining after 3154 variants removed due to allele frequency threshold(s) 42066 variants remaining after main filters.
Chromosome 3
plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/ld1 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr3_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr3/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr3/ld1.log
409 samples (41 females, 1 male, 367 ambiguous; 409 founders) loaded from 107776 variants loaded from output/snps_sets/linkage/ld1.bim. –extract: 38580 variants remaining. –keep-fam: 376 samples remaining. 376 samples (36 females, 1 male, 339 ambiguous; 376 founders) remaining after 2544 variants removed due to allele frequency threshold(s) 36036 variants remaining after main filters.
Check the vcfs
Prepare the files required by PopLDdecay
bcftools query -l output/snps_sets/linkage/chr1/ld1.vcf > output/snps_sets/linkage/vcf_samples.txt
bcftools query -l output/snps_sets/linkage/chr1/ld1.vcf | cut -d'_' -f1 | sort | uniq > output/snps_sets/linkage/unique_pops_from_vcf.txt
while read pop; do
grep "^${pop}_" output/snps_sets/linkage/vcf_samples.txt > output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
Create conda environment for PopLDdecay
module load miniconda
conda create -n PopLDdecay-env python pip
conda activate PopLDdecay-env
pip install PopLDdecay
cd PopLDdecay; chmod 755 configure; ./configure;
make;
mv PopLDdecay bin/; # [rm *.o]
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin
#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
Versions:
Perl/5.28.0-GCCcore-7.3.0
Perl/5.32.0-GCCcore-10.2.0-minimal
Perl/5.32.0-GCCcore-10.2.0
Perl/5.32.1-GCCcore-10.2.0
Perl/5.36.0-GCCcore-12.2.0
Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
while read pop; do
PopLDdecay -InVCF output/snps_sets/linkage/chr1/ld1.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/snps_sets/linkage/chr1/${pop}_LDdecay.gz \
-SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
# the Number of subPop samples[found in VCF] is 10
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4653
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 16989
# ##SampleSize*2 = 20
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3350
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 18893
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3139
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 19130
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 10
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3395
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 18735
# ##SampleSize*2 = 20
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4501
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 17502
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 13
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4727
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 16763
# ##SampleSize*2 = 26
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 10
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4048
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 17969
# ##SampleSize*2 = 20
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 14
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4307
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5385
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 15545
# ##SampleSize*2 = 28
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3070
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 19188
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 16
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3178
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 18325
# ##SampleSize*2 = 32
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2256
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 20036
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4676
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 17568
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 11
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2573
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 19372
# ##SampleSize*2 = 22
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 10
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2768
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 19294
# ##SampleSize*2 = 20
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 8
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2907
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 19209
# ##SampleSize*2 = 16
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2535
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 19754
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4107
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 18153
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2969
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 19150
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2137
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 20146
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2088
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 20198
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5220
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 17000
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 11
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3431
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 18509
# ##SampleSize*2 = 22
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5457
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 16749
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 9
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4094
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 17469
# ##SampleSize*2 = 18
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :1836
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 20261
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4848
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 17336
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 8
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3255
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 19005
# ##SampleSize*2 = 16
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 6
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5745
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 16159
# ##SampleSize*2 = 12
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 8
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :5607
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 16430
# ##SampleSize*2 = 16
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 7
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4307
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 17444
# ##SampleSize*2 = 14
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :4858
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 17335
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :1865
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 20429
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 9
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :3611
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 18409
# ##SampleSize*2 = 18
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
# the Number of subPop samples[found in VCF] is 12
# Warning skip non bi-allelic(Singleton/ThreeMulti allelic) site, and total skip allelic sites number is :2673
# ##begin pair-wise R^2 cal/D'.
# After filter Remain SNP Number : 19582
# ##SampleSize*2 = 24
# Used [perl ../bin/Plot_XX.pl ] to Plot the LDdecay
Get the files we need
for file in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ld_decay_results_list.txt
head -100 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ld_decay_results_list.txt
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ALD_LDdecay.stat.gz ALD
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ALU_LDdecay.stat.gz ALU
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ALV_LDdecay.stat.gz ALV
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ARM_LDdecay.stat.gz ARM
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/BAR_LDdecay.stat.gz BAR
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/BRE_LDdecay.stat.gz BRE
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/BUL_LDdecay.stat.gz BUL
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/CES_LDdecay.stat.gz CES
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/CRO_LDdecay.stat.gz CRO
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/DES_LDdecay.stat.gz DES
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/FRS_LDdecay.stat.gz FRS
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/GES_LDdecay.stat.gz GES
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/GRA_LDdecay.stat.gz GRA
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/GRC_LDdecay.stat.gz GRC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ITP_LDdecay.stat.gz ITP
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ITR_LDdecay.stat.gz ITR
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/KER_LDdecay.stat.gz KER
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/KRA_LDdecay.stat.gz KRA
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/MAL_LDdecay.stat.gz MAL
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/POP_LDdecay.stat.gz POP
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/RAR_LDdecay.stat.gz RAR
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ROS_LDdecay.stat.gz ROS
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SEV_LDdecay.stat.gz SEV
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SIC_LDdecay.stat.gz SIC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SLO_LDdecay.stat.gz SLO
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SOC_LDdecay.stat.gz SOC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SPB_LDdecay.stat.gz SPB
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SPC_LDdecay.stat.gz SPC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/SPS_LDdecay.stat.gz SPS
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/STS_LDdecay.stat.gz STS
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/TIK_LDdecay.stat.gz TIK
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/TRE_LDdecay.stat.gz TRE
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/TUA_LDdecay.stat.gz TUA
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/TUH_LDdecay.stat.gz TUH
output/snps_sets/linkage/chr1/ALD_LDdecay.stat.gz ALD output/snps_sets/linkage/chr1/ALU_LDdecay.stat.gz ALU output/snps_sets/linkage/chr1/ALV_LDdecay.stat.gz ALV output/snps_sets/linkage/chr1/ARM_LDdecay.stat.gz ARM output/snps_sets/linkage/chr1/BAR_LDdecay.stat.gz BAR output/snps_sets/linkage/chr1/BRE_LDdecay.stat.gz BRE output/snps_sets/linkage/chr1/BUL_LDdecay.stat.gz BUL output/snps_sets/linkage/chr1/CES_LDdecay.stat.gz CES output/snps_sets/linkage/chr1/CRO_LDdecay.stat.gz CRO output/snps_sets/linkage/chr1/DES_LDdecay.stat.gz DES output/snps_sets/linkage/chr1/FRS_LDdecay.stat.gz FRS output/snps_sets/linkage/chr1/GES_LDdecay.stat.gz GES output/snps_sets/linkage/chr1/GRA_LDdecay.stat.gz GRA output/snps_sets/linkage/chr1/GRC_LDdecay.stat.gz GRC output/snps_sets/linkage/chr1/ITP_LDdecay.stat.gz ITP output/snps_sets/linkage/chr1/ITR_LDdecay.stat.gz ITR output/snps_sets/linkage/chr1/KER_LDdecay.stat.gz KER output/snps_sets/linkage/chr1/KRA_LDdecay.stat.gz KRA output/snps_sets/linkage/chr1/MAL_LDdecay.stat.gz MAL output/snps_sets/linkage/chr1/POP_LDdecay.stat.gz POP output/snps_sets/linkage/chr1/RAR_LDdecay.stat.gz RAR output/snps_sets/linkage/chr1/ROS_LDdecay.stat.gz ROS output/snps_sets/linkage/chr1/SEV_LDdecay.stat.gz SEV output/snps_sets/linkage/chr1/SIC_LDdecay.stat.gz SIC output/snps_sets/linkage/chr1/SLO_LDdecay.stat.gz SLO output/snps_sets/linkage/chr1/SOC_LDdecay.stat.gz SOC output/snps_sets/linkage/chr1/SPB_LDdecay.stat.gz SPB output/snps_sets/linkage/chr1/SPC_LDdecay.stat.gz SPC output/snps_sets/linkage/chr1/SPS_LDdecay.stat.gz SPS output/snps_sets/linkage/chr1/STS_LDdecay.stat.gz STS output/snps_sets/linkage/chr1/TIK_LDdecay.stat.gz TIK output/snps_sets/linkage/chr1/TRE_LDdecay.stat.gz TRE output/snps_sets/linkage/chr1/TUA_LDdecay.stat.gz TUA output/snps_sets/linkage/chr1/TUH_LDdecay.stat.gz TUH
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(RColorBrewer)
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
p
Note: This produces a very big image, so you need to expand the viewer as large as possible to avoid getting an error
Save
ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/decay_chr1.pdf", plot = p, width = 5, height = 8)
Save the data to use later
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1327468 70.9 3366742 179.9 2157587 115.3
## Vcells 2818950 21.6 16153512 123.3 18405414 140.5
while read pop; do
PopLDdecay -InVCF output/snps_sets/linkage/chr2/ld1.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/snps_sets/linkage/chr2/${pop}_LDdecay.gz \
-SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
Get the files we need
for file in output/snps_sets/linkage/chr2/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/snps_sets/linkage/chr2/ld_decay_results_list.txt
head -100 output/snps_sets/linkage/chr2/ld_decay_results_list.txt
output/snps_sets/linkage/chr2/ALD_LDdecay.stat.gz ALD output/snps_sets/linkage/chr2/ALU_LDdecay.stat.gz ALU output/snps_sets/linkage/chr2/ALV_LDdecay.stat.gz ALV output/snps_sets/linkage/chr2/ARM_LDdecay.stat.gz ARM output/snps_sets/linkage/chr2/BAR_LDdecay.stat.gz BAR output/snps_sets/linkage/chr2/BRE_LDdecay.stat.gz BRE output/snps_sets/linkage/chr2/BUL_LDdecay.stat.gz BUL output/snps_sets/linkage/chr2/CES_LDdecay.stat.gz CES output/snps_sets/linkage/chr2/CRO_LDdecay.stat.gz CRO output/snps_sets/linkage/chr2/DES_LDdecay.stat.gz DES output/snps_sets/linkage/chr2/FRS_LDdecay.stat.gz FRS output/snps_sets/linkage/chr2/GES_LDdecay.stat.gz GES output/snps_sets/linkage/chr2/GRA_LDdecay.stat.gz GRA output/snps_sets/linkage/chr2/GRC_LDdecay.stat.gz GRC output/snps_sets/linkage/chr2/ITP_LDdecay.stat.gz ITP output/snps_sets/linkage/chr2/ITR_LDdecay.stat.gz ITR output/snps_sets/linkage/chr2/KER_LDdecay.stat.gz KER output/snps_sets/linkage/chr2/KRA_LDdecay.stat.gz KRA output/snps_sets/linkage/chr2/MAL_LDdecay.stat.gz MAL output/snps_sets/linkage/chr2/POP_LDdecay.stat.gz POP output/snps_sets/linkage/chr2/RAR_LDdecay.stat.gz RAR output/snps_sets/linkage/chr2/ROS_LDdecay.stat.gz ROS output/snps_sets/linkage/chr2/SEV_LDdecay.stat.gz SEV output/snps_sets/linkage/chr2/SIC_LDdecay.stat.gz SIC output/snps_sets/linkage/chr2/SLO_LDdecay.stat.gz SLO output/snps_sets/linkage/chr2/SOC_LDdecay.stat.gz SOC output/snps_sets/linkage/chr2/SPB_LDdecay.stat.gz SPB output/snps_sets/linkage/chr2/SPC_LDdecay.stat.gz SPC output/snps_sets/linkage/chr2/SPS_LDdecay.stat.gz SPS output/snps_sets/linkage/chr2/STS_LDdecay.stat.gz STS output/snps_sets/linkage/chr2/TIK_LDdecay.stat.gz TIK output/snps_sets/linkage/chr2/TRE_LDdecay.stat.gz TRE output/snps_sets/linkage/chr2/TUA_LDdecay.stat.gz TUA output/snps_sets/linkage/chr2/TUH_LDdecay.stat.gz TUH
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2/ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
p
Save
ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2/decay_chr2.pdf", plot = p, width = 5, height = 8)
Save the data to use later
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1327735 71.0 3366742 179.9 2320596 124.0
## Vcells 2820093 21.6 15571372 118.9 18405414 140.5
while read pop; do
PopLDdecay -InVCF output/snps_sets/linkage/chr3/ld1.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/snps_sets/linkage/chr3/${pop}_LDdecay.gz \
-SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
Get the files we need
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
for file in output/snps_sets/linkage/chr3/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/snps_sets/linkage/chr3/ld_decay_results_list.txt
head -100 output/snps_sets/linkage/chr3/ld_decay_results_list.txt
## output/snps_sets/linkage/chr3/ALD_LDdecay.stat.gz ALD
## output/snps_sets/linkage/chr3/ALU_LDdecay.stat.gz ALU
## output/snps_sets/linkage/chr3/ALV_LDdecay.stat.gz ALV
## output/snps_sets/linkage/chr3/ARM_LDdecay.stat.gz ARM
## output/snps_sets/linkage/chr3/BAR_LDdecay.stat.gz BAR
## output/snps_sets/linkage/chr3/BRE_LDdecay.stat.gz BRE
## output/snps_sets/linkage/chr3/BUL_LDdecay.stat.gz BUL
## output/snps_sets/linkage/chr3/CES_LDdecay.stat.gz CES
## output/snps_sets/linkage/chr3/CRO_LDdecay.stat.gz CRO
## output/snps_sets/linkage/chr3/DES_LDdecay.stat.gz DES
## output/snps_sets/linkage/chr3/FRS_LDdecay.stat.gz FRS
## output/snps_sets/linkage/chr3/GES_LDdecay.stat.gz GES
## output/snps_sets/linkage/chr3/GRA_LDdecay.stat.gz GRA
## output/snps_sets/linkage/chr3/GRC_LDdecay.stat.gz GRC
## output/snps_sets/linkage/chr3/ITP_LDdecay.stat.gz ITP
## output/snps_sets/linkage/chr3/ITR_LDdecay.stat.gz ITR
## output/snps_sets/linkage/chr3/KER_LDdecay.stat.gz KER
## output/snps_sets/linkage/chr3/KRA_LDdecay.stat.gz KRA
## output/snps_sets/linkage/chr3/MAL_LDdecay.stat.gz MAL
## output/snps_sets/linkage/chr3/POP_LDdecay.stat.gz POP
## output/snps_sets/linkage/chr3/RAR_LDdecay.stat.gz RAR
## output/snps_sets/linkage/chr3/ROS_LDdecay.stat.gz ROS
## output/snps_sets/linkage/chr3/SEV_LDdecay.stat.gz SEV
## output/snps_sets/linkage/chr3/SIC_LDdecay.stat.gz SIC
## output/snps_sets/linkage/chr3/SLO_LDdecay.stat.gz SLO
## output/snps_sets/linkage/chr3/SOC_LDdecay.stat.gz SOC
## output/snps_sets/linkage/chr3/SPB_LDdecay.stat.gz SPB
## output/snps_sets/linkage/chr3/SPC_LDdecay.stat.gz SPC
## output/snps_sets/linkage/chr3/SPS_LDdecay.stat.gz SPS
## output/snps_sets/linkage/chr3/STS_LDdecay.stat.gz STS
## output/snps_sets/linkage/chr3/TIK_LDdecay.stat.gz TIK
## output/snps_sets/linkage/chr3/TRE_LDdecay.stat.gz TRE
## output/snps_sets/linkage/chr3/TUA_LDdecay.stat.gz TUA
## output/snps_sets/linkage/chr3/TUH_LDdecay.stat.gz TUH
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
#p
Save
ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/decay_chr3.pdf", plot = p, width = 5, height = 8)
Save the data to use later
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1769332 94.5 3366742 179.9 2320596 124.0
## Vcells 6059018 46.3 25471977 194.4 24446946 186.6
Create data frame with the data from each chromosome
Import the data
# Chromosome 1
chr1 <- readRDS(here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr1/chr1.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr1 = half_distance
) |>
mutate(chr1 = chr1/1000)
# Chromosome 2
chr2 <- readRDS(here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr2/chr2.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr2 = half_distance
) |>
mutate(chr2 = chr2/1000)
# Chromosome 3
chr3 <- readRDS(here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/chr3.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr3 = half_distance
) |>
mutate(chr3 = chr3/1000)
# Merge
distance <- merge(merge(chr1, chr2, by="population"), chr3, by="population")
# Convert data from wide to long format
data_long <- gather(distance, key = "chr", value = "value", -population)
# Make it capital
data_long$chr <- tools::toTitleCase(data_long$chr)
# Remove KAT because it has mosquitoes collected from different source and had higher linkage due it
data_long <- subset(data_long, population != "KAT")
Make one plot
# Define a custom color palette
custom_palette <- c(
"Chr1" = "orange",
"Chr2" = "green3",
"Chr3" = "blue"
)
# Reordering the levels of the chr factor
data_long$chr <- factor(data_long$chr, levels = c("Chr3", "Chr2", "Chr1"))
# source the plotting function
source(here("analyses", "my_theme2.R"))
# Plotting half_distance with borders and spaced bars
ggplot(data_long, aes(x = population, y = value, fill = chr)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
labs(title = "", y = "Half Distance (kb)", x = "Population") +
geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 3) +
my_theme() +
coord_flip() +
scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
theme(legend.position = "top",
plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"))
Save
ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_comparison.pdf", width = 6, height = 12)
Import sample locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
## population chr value
## 1 ALD Chr1 52.3
## 2 ALU Chr1 24.8
## 3 ALV Chr1 17.8
## 4 ARM Chr1 23.0
## 5 BAR Chr1 71.7
## 6 BRE Chr1 28.2
# Join with sampling_loc to get sampling localities
distance2 <- data_long |>
left_join(sampling_loc, by = c("population" = "Abbreviation"))
head(distance2)
## population chr value Pop_City Country Latitude Longitude Continent Year
## 1 ALD Chr1 52.3 Durres Albania 41.29704 19.50373 Europe 2018
## 2 ALU Chr1 24.8 Alushta Ukraine 44.68289 34.40368 Europe 2021
## 3 ALV Chr1 17.8 Vlore Albania 40.46600 19.48970 Europe 2020
## 4 ARM Chr1 23.0 Ijevan Armenia 40.87971 45.14764 Europe 2020
## 5 BAR Chr1 71.7 Barcelona Spain 41.38510 2.17340 Europe 2018
## 6 BRE Chr1 28.2 Brescia Italy 45.53373 10.20445 Europe 1995
## Region Subregion order
## 1 Southern Europe East Europe 25
## 2 Eastern Europe East Europe 35
## 3 Southern Europe East Europe 24
## 4 Eastern Europe East Europe 42
## 5 Southern Europe West Europe 8
## 6 Southern Europe West Europe 12
Add the name of the city/countries to the plot
# Creating the label with population, city, and country for the y-axis
distance2$pop_city_country_label <- paste(distance2$population, "\n", distance2$Pop_City, "\n(", distance2$Country, ")", sep = "")
# Sorting by Country, then City, and then by Population to ensure populations from the same country (and city) are plotted together
distance2 <- distance2 %>% arrange(Country, Pop_City, population)
# Adjusting the factor levels for plotting in the desired order
distance2$pop_city_country_label <- fct_inorder(distance2$pop_city_country_label)
# Plotting the data
ggplot(distance2, aes(x = pop_city_country_label, y = value, fill = chr)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
labs(title = "", y = "Half Distance (kb)", x = "") +
geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
my_theme() +
coord_flip() +
labs(title = "", y = "Half Distance (kb)", x = "Population") +
scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
theme(legend.position = "top",
plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"),
axis.text.y = element_text(size = 7, angle = 0, hjust = 1))
Save
ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_comp_poplabels.pdf", width = 6, height = 12)
# Create table
dist_table <- distance2 |>
dplyr::select(
population, Pop_City, Country, chr, value
) |>
dplyr::rename(
City = Pop_City,
Population = population
)
# Spread and arrange
dist_table_condensed <- dist_table %>%
spread(key = chr, value = value) %>%
dplyr::select(Population, Country, City, Chr1, Chr2, Chr3) |>
dplyr::arrange(Country, City)
# Create table
ft <- flextable::flextable(dist_table_condensed)
# Apply zebra striping with bg
ft <- flextable::theme_zebra(ft)
# Show it
ft
Population | Country | City | Chr1 | Chr2 | Chr3 |
---|---|---|---|---|---|
ALD | Albania | Durres | 52.3 | 109.4 | 47.2 |
ALV | Albania | Vlore | 17.8 | 0.5 | 79.9 |
ARM | Armenia | Ijevan | 23.0 | 83.5 | 104.5 |
BUL | Bulgaria | Lom | 81.7 | 75.0 | 164.1 |
CRO | Croatia | Dubrovnik | 23.8 | 6.0 | 119.0 |
FRS | France | Saint-Martin-d'Heres | 1.9 | 43.2 | 59.2 |
STS | France | Strasbourg | 65.2 | 136.3 | 177.8 |
GES | Georgia | Sakhumi, Abkhazia | 36.3 | 75.0 | 139.2 |
GRA | Greece | Athens | 21.9 | 61.4 | 2.7 |
GRC | Greece | Chania | 79.8 | 161.1 | 87.9 |
BRE | Italy | Brescia | 28.2 | 57.0 | 90.1 |
CES | Italy | Cesena | 6.2 | 93.2 | 37.3 |
DES | Italy | Desenzano | 29.5 | 50.9 | 46.1 |
ITP | Italy | Puglia | 134.5 | 166.2 | 290.9 |
ITR | Italy | Rome (Trappola) | 23.0 | 42.4 | 42.4 |
SIC | Italy | Sicilia | 82.3 | 126.1 | 124.2 |
TRE | Italy | Trentino | 36.4 | 36.1 | 35.3 |
MAL | Malta | Luqa | 9.0 | 33.5 | 86.4 |
POP | Portugal | Penafiel | 10.5 | 30.3 | 25.0 |
ROS | Romania | Satu Mare | 13.6 | 63.1 | 83.6 |
RAR | Russia | Armavir | 24.8 | 115.2 | 73.9 |
KRA | Russia | Krasnodar | 31.2 | 15.4 | 50.3 |
SOC | Russia | Sochi | 40.6 | 100.8 | 77.9 |
TIK | Russia | Tikhoretsk | 14.2 | 75.9 | 163.9 |
SLO | Slovenia | Ajdovscina | 20.3 | 13.9 | 34.8 |
SPB | Spain | Badajoz | 4.3 | 56.1 | 55.8 |
BAR | Spain | Barcelona | 71.7 | 3.5 | 61.0 |
SPC | Spain | Catarroja | 45.4 | 86.2 | 162.9 |
SPS | Spain | San Roque | 90.2 | 374.2 | 42.8 |
TUA | Turkey | Aliaga | 97.6 | 182.0 | 162.5 |
TUH | Turkey | Hopa | 4.7 | 0.3 | 100.6 |
ALU | Ukraine | Alushta | 24.8 | 77.8 | 64.4 |
KER | Ukraine | Kerch, Crimea | 47.4 | 110.9 | 82.4 |
SEV | Ukraine | Sevastopol, Crimea | 11.3 | 64.5 | 78.1 |
# Create a new Word document
doc <- read_docx()
# Add the flextable
doc <- body_add_flextable(doc, value = ft)
# Save the document to a file
# Define the path for saving the Word document
file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_albopictus.docx")
print(doc, target = file_path)
Get Counts
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/chr3/ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Get the number of individuals
individuals_count <- table(fam_data$V1)
individuals_count
##
## ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC ITP ITR KER KRA MAL POP
## 10 12 12 10 12 13 10 14 12 16 12 12 11 10 8 12 12 12 12 12
## RAR ROS SEV SIC SLO SOC SPB SPC SPS STS TIK TRE TUA TUH
## 12 11 12 9 12 12 8 6 8 7 12 12 9 12
Merge the data
# Convert individuals_count to a data frame
individuals_df <- as.data.frame(individuals_count)
colnames(individuals_df) <- c("Population", "Count")
# Merge with dist_table_condensed
merged_data <- merge(dist_table_condensed, individuals_df, by="Population")
Check correlation
# Chr1 vs Count
plot_Chr1 <- ggplot(merged_data, aes(x = Count, y = Chr1)) +
geom_point() +
geom_smooth(method = "lm", col = "orange") +
ggtitle("Correlation between Count and Chr1") +
labs(x = "Count", y = "LD Half Distance - Chr1")
# Chr2 vs Count
plot_Chr2 <- ggplot(merged_data, aes(x = Count, y = Chr2)) +
geom_point() +
geom_smooth(method = "lm", col = "green3") +
ggtitle("Correlation between Count and Chr2") +
labs(x = "Count", y = "LD Half Distance - Chr2")
# Chr3 vs Count
plot_Chr3 <- ggplot(merged_data, aes(x = Count, y = Chr3)) +
geom_point() +
geom_smooth(method = "lm", col = "blue") +
ggtitle("Correlation between Count and Chr3") +
labs(x = "Count", y = "LD Half Distance - Chr3")
# Display the plots
print(plot_Chr1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# For Chr1
model_Chr1 <- lm(Chr1 ~ Count, data = merged_data)
summary_Chr1 <- summary(model_Chr1)
R2_Chr1 <- summary_Chr1$r.squared
p_value_Chr1 <- coef(summary_Chr1)[2,4]
# For Chr2
model_Chr2 <- lm(Chr2 ~ Count, data = merged_data)
summary_Chr2 <- summary(model_Chr2)
R2_Chr2 <- summary_Chr2$r.squared
p_value_Chr2 <- coef(summary_Chr2)[2,4]
# For Chr3
model_Chr3 <- lm(Chr3 ~ Count, data = merged_data)
summary_Chr3 <- summary(model_Chr3)
R2_Chr3 <- summary_Chr3$r.squared
p_value_Chr3 <- coef(summary_Chr3)[2,4]
# Display R2 and p-values
cat("For Chr1: R2 =", R2_Chr1, ", p-value =", p_value_Chr1, "\n")
## For Chr1: R2 = 0.3023191 , p-value = 0.0007558585
## For Chr2: R2 = 0.2743179 , p-value = 0.001478025
## For Chr3: R2 = 0.2574468 , p-value = 0.002192244
# For Chr1
model_Chr1 <- lm(Chr1 ~ Count, data = merged_data)
summary_Chr1 <- summary(model_Chr1)
R2_Chr1 <- summary_Chr1$r.squared
p_value_Chr1 <- coef(summary_Chr1)[2,4]
# For Chr2
model_Chr2 <- lm(Chr2 ~ Count, data = merged_data)
summary_Chr2 <- summary(model_Chr2)
R2_Chr2 <- summary_Chr2$r.squared
p_value_Chr2 <- coef(summary_Chr2)[2,4]
# For Chr3
model_Chr3 <- lm(Chr3 ~ Count, data = merged_data)
summary_Chr3 <- summary(model_Chr3)
R2_Chr3 <- summary_Chr3$r.squared
p_value_Chr3 <- coef(summary_Chr3)[2,4]
# Display R2 and p-values
cat("For Chr1: R2 =", R2_Chr1, ", p-value =", p_value_Chr1, "\n")
## For Chr1: R2 = 0.3023191 , p-value = 0.0007558585
## For Chr2: R2 = 0.2743179 , p-value = 0.001478025
## For Chr3: R2 = 0.2574468 , p-value = 0.002192244
Annotate the plots
# Helper function to extract coefficients, R2, and create the label
get_annotation <- function(model) {
coefs <- coef(model)
eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
return(paste(eq, r2, sep = "\n"))
}
# Annotations for each chromosome
annotate_Chr1 <- get_annotation(model_Chr1)
annotate_Chr2 <- get_annotation(model_Chr2)
annotate_Chr3 <- get_annotation(model_Chr3)
# Plot with annotations
plot_with_annotation <- function(data, yvar, label) {
ggplot(data, aes(x = Count, y = !!sym(yvar))) +
geom_point() +
geom_smooth(method = "lm", col = "orange") +
annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
labs(x = "Count", y = yvar)
}
plot1 <- plot_with_annotation(merged_data, "Chr1", annotate_Chr1)
# Display the plots with annotations
print(plot1)
## `geom_smooth()` using formula = 'y ~ x'
# Plot with annotations
plot_with_annotation <- function(data, yvar, label) {
ggplot(data, aes(x = Count, y = !!sym(yvar))) +
geom_point() +
geom_smooth(method = "lm", col = "green3") +
annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
labs(x = "Count", y = yvar)
}
plot2 <- plot_with_annotation(merged_data, "Chr2", annotate_Chr2)
print(plot2)
## `geom_smooth()` using formula = 'y ~ x'
plot_with_annotation <- function(data, yvar, label) {
ggplot(data, aes(x = Count, y = !!sym(yvar))) +
geom_point() +
geom_smooth(method = "lm", col = "blue") +
annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
labs(x = "Count", y = yvar)
}
plot3 <- plot_with_annotation(merged_data, "Chr3", annotate_Chr3)
print(plot3)
## `geom_smooth()` using formula = 'y ~ x'
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
merged_data_long <- melt(setDT(merged_data), id.vars = c("Population","Country", "City", "Count"), variable.name = "Chromosome")
saveRDS(merged_data_long, here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/merged_data_long.rds"
))
Create a facet plot
merged_data_long <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/merged_data_long.rds"))
# Function to compute the linear model details
get_lm_details <- function(data, yvar) {
model <- lm(data[[yvar]] ~ data$Count)
coefs <- coef(model)
eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
return(tibble(Chromosome = yvar, Equation = eq, R2 = r2))
}
# Compute details for each chromosome and bind rows
annotations <- bind_rows(get_lm_details(merged_data, "Chr1"),
get_lm_details(merged_data, "Chr2"),
get_lm_details(merged_data, "Chr3"))
# Compute maximum y for annotations for each chromosome
annotations <- annotations %>%
left_join(merged_data_long %>% group_by(Chromosome) %>% summarise(MaxY = max(value, na.rm = TRUE)), by = "Chromosome")
# Plotting with annotations correctly positioned in the top right corner
facet_plot <- ggplot(merged_data_long, aes(x = Count, y = value)) +
geom_point() +
geom_smooth(method = "lm", col = "black") +
geom_text(data = annotations, aes(label = paste(Equation, R2, sep = "\n"), y = MaxY),
x = max(merged_data$Count), hjust = 1, vjust = 1) +
facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
labs(x = "Number of Samples", y = "LD Half Distance (kb)") +
my_theme()
print(facet_plot)
## `geom_smooth()` using formula = 'y ~ x'
# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_sample_size.pdf"), width = 4, height = 6)
Plot all countries
# Calculate mean for each Country and Chromosome
mean_data <- merged_data_long %>%
group_by(Country, Chromosome) %>%
summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")
# Jitter plot overlaid on boxplot for each chromosome by country without legend
ggplot(merged_data_long, aes(x = Country, y = value)) +
geom_boxplot(alpha = 0.5) +
geom_jitter(aes(color = Country), width = 0.2) +
geom_text(data = mean_data, aes(y = mean_value, label = round(mean_value, 2)), vjust = 0.5, hjust = -0.5, size = 3) + # Annotate mean value
facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
theme_minimal() +
labs(title = "", y = "LD Half Distance (kb)") +
my_theme() +
coord_flip() +
theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "none")
# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_mean_by_country.pdf"), width = 6, height = 6)
Legend: The Linkage Disequilibrium (LD) half distance values in kilobases (kb) for the Aedes albopictus mosquito across countries, grouped by chromosome. Each boxplot displays the interquartile range of the LD Half Distance values, with the horizontal line in the box marking the median. Colored jittered points represent individual data points, with each color corresponding to a specific country. The plot also annotates each country-chromosome combination’s mean LD half distance. Distinct facets separate the values for different chromosomes to offer clear visualization.
Use ggstatplot to test if there is any significant differences Chr1
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library(multcompView)
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr1, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr1,
title = "Chr1",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + my_theme() + theme(legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_chr1_test.pdf"), width = 7, height = 6)
Chr2
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr2, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr2,
title = "Chr2",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + my_theme() + theme(legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/linkage/decay_chr2_test.pdf"), width = 7, height = 6)
Chr3
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr3, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr3,
title = "Chr3",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + my_theme() + theme(legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
Mosquitoes removed due excess heterozygosity
## QNC 1248
## KAT 601
## KAT 605
## KAT 606
## KAT 608
## GRA 734
## TUA 783
## ITP 832
## ROS 858
Mosquitoes removed duo to relatedness
## #FID IID
## SIC 1231
## SIC 1235
## SIC 1236
## KAN 1327
## DES 1445
## REC 179
## ITB 748
## SPM 767
## TUA 779
## TUA 780
## SPS 914
Create a list of individuals we removed
echo 'SIC 1231
SIC 1235
SIC 1236
KAN 1327
DES 1445
REC 179
ITB 748
SPM 767
TUA 779
TUA 780
SPS 914' > /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/remove.txt
We can start with file7 (copied to the new linkage directory), because in the QC Markdown we created this file using the chromosomal scale, and it already has been filtered for relatedness, heterozygosity, missingness, etc.
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/file7.fam | sort | uniq -c | awk '{print $2, $1}'
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BEN 12
## BER 12
## BRE 13
## BUL 10
## CAM 12
## CES 14
## CHA 12
## CRO 12
## DES 16
## FRS 12
## GEL 2
## GES 12
## GRA 11
## GRC 10
## GRV 12
## HAI 12
## HAN 4
## HOC 7
## HUN 12
## IMP 4
## INJ 11
## INW 4
## ITB 5
## ITP 9
## ITR 12
## JAF 2
## KAC 6
## KAG 12
## KAN 11
## KAT 6
## KER 12
## KLP 4
## KRA 12
## KUN 4
## LAM 9
## MAL 12
## MAT 12
## OKI 12
## PAL 11
## POL 2
## POP 12
## QNC 11
## RAR 12
## REC 11
## ROM 4
## ROS 11
## SER 4
## SEV 12
## SIC 9
## SLO 12
## SOC 12
## SON 3
## SPB 8
## SPC 6
## SPM 5
## SPS 8
## SSK 12
## STS 12
## SUF 6
## SUU 6
## TAI 7
## TIK 12
## TIR 4
## TRE 12
## TUA 9
## TUH 12
## UTS 12
## YUN 9
Create list of populations
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
awk '{print $1}' output/snps_sets/linkage/file7.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 6 {print}' | awk '{print $1}' > output/snps_sets/linkage/pops_4ld.txt;
head -n100 output/snps_sets/linkage/pops_4ld.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BEN
## BER
## BRE
## BUL
## CAM
## CES
## CHA
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## GRV
## HAI
## HOC
## HUN
## INJ
## ITP
## ITR
## KAC
## KAG
## KAN
## KAT
## KER
## KRA
## LAM
## MAL
## MAT
## OKI
## PAL
## POP
## QNC
## RAR
## REC
## ROS
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPS
## SSK
## STS
## SUF
## SUU
## TAI
## TIK
## TRE
## TUA
## TUH
## UTS
## YUN
We will use MAF 0.01 for each chromosome (SNP Set 3)
We can estimate LD for each chromosome separated.
Import the .bim file with the SNPs to create a new chromosomal scale.
# import the bim file with the SNP data ####
snps <-
read_delim( # to learn about the options use here, run ?read_delim on the console.
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/file7.bim"
), # use library here to load it
col_names = FALSE, # we don't have header in the input file
show_col_types = FALSE, # suppress message from read_delim
col_types = "ccidcc" # set the class of each column
)
#
# set column names
colnames(
snps
) <- # to add a header in our tibble
c(
"Chromosome", "SNP", "Cm", "Position", "Allele1", "Allele2"
)
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
## Chromosome SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583035163 0 315386 A G
## 2 1 AX-583033356 0 315674 C T
## 3 1 AX-583033370 0 330057 G A
## 4 1 AX-583035194 0 330265 A G
## 5 1 AX-583033387 0 331288 C T
## 6 1 AX-583035257 0 442875 T C
Separate the tibbles into each chromosome.
# chr1
chr1_snps <-
snps |>
filter(Chromosome == 1) |> # here we get only Chromosome rows starting with 1
as_tibble() # save as tibble
#
# chr2
chr2_snps <-
snps |>
filter(Chromosome == 2) |>
as_tibble()
#
# chr3
chr3_snps <-
snps |>
filter(Chromosome == 3) |>
as_tibble()
We have objects with the SNPs for each chromosome
## # A tibble: 6 × 6
## Chromosome SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583035163 0 315386 A G
## 2 1 AX-583033356 0 315674 C T
## 3 1 AX-583033370 0 330057 G A
## 4 1 AX-583035194 0 330265 A G
## 5 1 AX-583033387 0 331288 C T
## 6 1 AX-583035257 0 442875 T C
Filter the data by chromosome
#Chromosome 1
write.table(
chr1_snps$SNP,
file = here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_ld_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
#Chromosome 2
write.table(
chr2_snps$SNP,
file = here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_ld_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
#Chromosome 3
write.table(
chr3_snps$SNP,
file = here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_ld_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Now create new files for each chromosome
Create dirs
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
# make directory
mkdir -p output/snps_sets/linkage/chr1;
mkdir -p output/snps_sets/linkage/chr2;
mkdir -p output/snps_sets/linkage/chr3;
Chromosome 1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/file7 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr1_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr1/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr1/ld1.log
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from 87183 variants loaded from output/snps_sets/linkage/file7.bim. –extract: 19362 variants remaining. –keep-fam: 637 samples remaining. 637 samples (80 females, 60 males, 497 ambiguous; 637 founders) remaining after 0 variants removed due to allele frequency threshold(s) 19362 variants remaining after main filters.
Chromosome 2
plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/file7 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr2_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr2/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr2/ld1.log
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from 87183 variants loaded from output/snps_sets/linkage/file7.bim. –extract: 36611 variants remaining. –keep-fam: 637 samples remaining. 637 samples (80 females, 60 males, 497 ambiguous; 637 founders) remaining after 0 variants removed due to allele frequency threshold(s) 36611 variants remaining after main filters.
Chromosome 3
plink2 \
--allow-extra-chr \
--bfile output/snps_sets/linkage/file7 \
--keep-fam output/snps_sets/linkage/pops_4ld.txt \
--extract output/snps_sets/linkage/chr3_ld_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out output/snps_sets/linkage/chr3/ld1 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/chr3/ld1.log
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from 87183 variants loaded from output/snps_sets/linkage/file7.bim. –extract: 31210 variants remaining. –keep-fam: 637 samples remaining. 637 samples (80 females, 60 males, 497 ambiguous; 637 founders) remaining after 0 variants removed due to allele frequency threshold(s) 31210 variants remaining after main filters.
Check the vcfs
## output/snps_sets/linkage/chr1/ld1.vcf
## output/snps_sets/linkage/chr2/ld1.vcf
## output/snps_sets/linkage/chr3/ld1.vcf
Prepare the files required by PopLDdecay
bcftools query -l output/snps_sets/linkage/chr1/ld1.vcf > output/snps_sets/linkage/vcf_samples.txt
bcftools query -l output/snps_sets/linkage/chr1/ld1.vcf | cut -d'_' -f1 | sort | uniq > output/snps_sets/linkage/unique_pops_from_vcf.txt
while read pop; do
grep "^${pop}_" output/snps_sets/linkage/vcf_samples.txt > output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin
#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
PopLDdecay -InVCF output/snps_sets/linkage/chr1/ld1.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/snps_sets/linkage/chr1/${pop}_LDdecay.gz \
-SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
Get the files we need
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
for file in output/snps_sets/linkage/chr1/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/snps_sets/linkage/chr1/ld_decay_results_list.txt
head -100 output/snps_sets/linkage/chr1/ld_decay_results_list.txt
Exit the conda environment and load R
module load Perl/5.36.1-GCCcore-12.2.0
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl --help
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl \
-in output/snps_sets/linkage/chr1/ld_decay_results_list.txt \
-output output/snps_sets/linkage/chr1/all_pops \
-bin1 10 \
-bin2 100 \
-break 100 \
-maxX 1000 \
-measure r2 \
-method MeanBin \
-keepR
# Define the path to the .fam file using here
library(ggrepel)
library(RColorBrewer)
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1/ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
library(scales)
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
#p
#If you get an "Error in grid.Call...", the figure is too big for you viewer. Just save it as a pdf instead and open the pdf to view.
ggsave(
filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr1.pdf"),
width = 14,
height = 16,
units = "in"
)
Note: This produces a very big image, so you need to expand the viewer as large as possible to avoid getting an error
Save the data to use later
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4312232 230.3 7322801 391.1 4966314 265.3
## Vcells 12977215 99.1 47019476 358.8 45802637 349.5
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin
#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
PopLDdecay -InVCF output/snps_sets/linkage/chr2/ld1.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/snps_sets/linkage/chr2/${pop}_LDdecay.gz \
-SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
Get the files we need
for file in output/snps_sets/linkage/chr2/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/snps_sets/linkage/chr2/ld_decay_results_list.txt
head -100 output/snps_sets/linkage/chr2/ld_decay_results_list.txt
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2/ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
#p
ggsave(
filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr2.pdf"),
width = 14,
height = 16,
units = "in"
)
Note: This produces a very big image, so you need to expand the viewer as large as possible to avoid getting an error
Save the data to use later
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4312317 230.4 7322801 391.1 5619349 300.2
## Vcells 13039134 99.5 47019476 358.8 47019323 358.8
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin
#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
PopLDdecay -InVCF output/snps_sets/linkage/chr3/ld1.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/snps_sets/linkage/chr3/${pop}_LDdecay.gz \
-SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
Get the files we need
for file in output/snps_sets/linkage/chr3/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/snps_sets/linkage/chr3/ld_decay_results_list.txt
head -100 output/snps_sets/linkage/chr3/ld_decay_results_list.txt
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3/ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
#p
ggsave(
filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr3.pdf"),
width = 14,
height = 16,
units = "in"
)
Save the data to use later
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4312385 230.4 7322801 391.1 5619349 300.2
## Vcells 13040010 99.5 47019476 358.8 47019323 358.8
Create data frame with the data from each chromosome
Import the data
# Chromosome 1
chr1 <- readRDS(here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1/chr1.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr1 = half_distance
) |>
mutate(chr1 = chr1/1000)
# Chromosome 2
chr2 <- readRDS(here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2/chr2.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr2 = half_distance
) |>
mutate(chr2 = chr2/1000)
# Chromosome 3
chr3 <- readRDS(here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3/chr3.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr3 = half_distance
) |>
mutate(chr3 = chr3/1000)
# Merge
distance <- merge(merge(chr1, chr2, by="population"), chr3, by="population")
# Convert data from wide to long format
data_long <- gather(distance, key = "chr", value = "value", -population)
# Make it capital
data_long$chr <- tools::toTitleCase(data_long$chr)
# Remove KAT because it has mosquitoes collected from different source and had higher linkage due it
data_long <- subset(data_long, population != "KAT")
Make one plot
# Define a custom color palette
custom_palette <- c(
"Chr1" = "orange",
"Chr2" = "green3",
"Chr3" = "blue"
)
# Reordering the levels of the chr factor
data_long$chr <- factor(data_long$chr, levels = c("Chr3", "Chr2", "Chr1"))
# source the plotting function
source(here("analyses", "my_theme2.R"))
# Plotting half_distance with borders and spaced bars
ggplot(data_long, aes(x = population, y = value, fill = chr)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
labs(title = "", y = "Half Distance (kb)", x = "Population") +
geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
my_theme() +
coord_flip() +
scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
theme(legend.position = "top",
plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"))
Save
ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/ld/decay_comparison.pdf", width = 8, height = 14)
Import sample locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
## population chr value
## 1 ALD Chr1 40.7
## 2 ALU Chr1 9.9
## 3 ALV Chr1 36.7
## 4 ARM Chr1 14.2
## 5 BAR Chr1 20.3
## 6 BEN Chr1 6.3
# Join with sampling_loc to get sampling localities
distance2 <- data_long |>
left_join(sampling_loc, by = c("population" = "Abbreviation"))
head(distance2)
## population chr value Pop_City Country Latitude Longitude Continent Year
## 1 ALD Chr1 40.7 Durres Albania 41.29704 19.50373 Europe 2018
## 2 ALU Chr1 9.9 Alushta Ukraine 44.68289 34.40368 Europe 2021
## 3 ALV Chr1 36.7 Vlore Albania 40.46600 19.48970 Europe 2020
## 4 ARM Chr1 14.2 Ijevan Armenia 40.87971 45.14764 Europe 2020
## 5 BAR Chr1 20.3 Barcelona Spain 41.38510 2.17340 Europe 2018
## 6 BEN Chr1 6.3 Bengaluru India 12.97160 77.59460 Asia Unknown
## Region Subregion order
## 1 Southern Europe East Europe 25
## 2 Eastern Europe East Europe 35
## 3 Southern Europe East Europe 24
## 4 Eastern Europe East Europe 42
## 5 Southern Europe West Europe 8
## 6 South Asia 52
Add the name of the city/countries to the plot
# Creating the label with population, city, and country for the y-axis
distance2$pop_city_country_label <- paste(distance2$population, "\n", distance2$Pop_City, "\n(", distance2$Country, ")", sep = "")
# Sorting by Country, then City, and then by Population to ensure populations from the same country (and city) are plotted together
distance2 <- distance2 %>% arrange(Country, Pop_City, population)
# Adjusting the factor levels for plotting in the desired order
distance2$pop_city_country_label <- fct_inorder(distance2$pop_city_country_label)
# Plotting the data
ggplot(distance2, aes(x = pop_city_country_label, y = value, fill = chr)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
labs(title = "", y = "Half Distance (kb)", x = "") +
geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
my_theme() +
coord_flip() +
labs(title = "", y = "Half Distance (kb)", x = "Population") +
scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
theme(legend.position = "top",
plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"),
axis.text.y = element_text(size = 5, angle = 0, hjust = 1))
Save
# Create table
dist_table <- distance2 |>
dplyr::select(
population, Pop_City, Country, chr, value
) |>
dplyr::rename(
City = Pop_City,
Population = population
)
# Spread and arrange
dist_table_condensed <- dist_table %>%
spread(key = chr, value = value) %>%
dplyr::select(Population, Country, City, Chr1, Chr2, Chr3) |>
dplyr::arrange(Country, City)
# Create table
ft <- flextable::flextable(dist_table_condensed)
# Apply zebra striping with bg
ft <- flextable::theme_zebra(ft)
# Show it
ft
Population | Country | City | Chr1 | Chr2 | Chr3 |
---|---|---|---|---|---|
ALD | Albania | Durres | 40.7 | 72.3 | 47.2 |
ALV | Albania | Vlore | 36.7 | 2.1 | 81.2 |
ARM | Armenia | Ijevan | 14.2 | 53.4 | 37.8 |
GRV | Brazil | Gravatai | 26.8 | 183.0 | 64.1 |
REC | Brazil | Recife, PE | 5.4 | 141.4 | 89.2 |
BUL | Bulgaria | Lom | 57.9 | 95.5 | 57.0 |
CAM | Cambodia | Phnom Penh | 16.7 | 70.2 | 0.9 |
HAI | China | Hainan | 1.5 | 71.5 | 4.6 |
HUN | China | Hunan | 12.1 | 57.9 | 45.7 |
YUN | China | Yunnan | 6.7 | 153.8 | 1.4 |
CRO | Croatia | Dubrovnik | 23.1 | 1.6 | 34.2 |
FRS | France | Saint-Martin-d'Heres | 2.1 | 25.5 | 56.9 |
STS | France | Strasbourg | 33.3 | 39.5 | 88.6 |
GES | Georgia | Sakhumi, Abkhazia | 5.6 | 94.3 | 120.4 |
GRA | Greece | Athens | 32.2 | 40.3 | 0.5 |
GRC | Greece | Chania | 62.0 | 141.4 | 87.9 |
BEN | India | Bengaluru | 6.3 | 116.5 | 53.2 |
INJ | Indonesia | Jakarta | 8.0 | 90.8 | 58.8 |
SUF | Indonesia | Sulawesi (Forest) | 5.9 | 115.2 | 170.9 |
SUU | Indonesia | Sulawesi (Urban) | 35.3 | 200.2 | 196.1 |
BRE | Italy | Brescia | 31.6 | 58.1 | 68.4 |
CES | Italy | Cesena | 6.2 | 93.2 | 34.4 |
DES | Italy | Desenzano | 27.3 | 52.9 | 36.8 |
ITP | Italy | Puglia | 27.4 | 2.9 | 120.1 |
ITR | Italy | Rome (Trappola) | 40.1 | 51.1 | 36.8 |
SIC | Italy | Sicilia | 32.4 | 52.9 | 124.8 |
TRE | Italy | Trentino | 26.5 | 36.7 | 35.3 |
KAG | Japan | Kagoshima | 26.7 | 17.6 | 20.6 |
KAN | Japan | Kanazawa | 23.0 | 36.3 | 30.6 |
OKI | Japan | Okinawa | 25.3 | 21.4 | 77.7 |
UTS | Japan | Utsunomiya | 35.2 | 14.6 | 32.2 |
MAT | Malaysia | Tambun | 25.5 | 182.4 | 45.3 |
MAL | Malta | Luqa | 36.7 | 33.5 | 32.2 |
POP | Portugal | Penafiel | 20.8 | 30.3 | 35.9 |
ROS | Romania | Satu Mare | 34.1 | 36.9 | 83.6 |
RAR | Russia | Armavir | 21.9 | 70.8 | 31.6 |
KRA | Russia | Krasnodar | 31.2 | 15.4 | 50.3 |
SOC | Russia | Sochi | 45.3 | 103.8 | 60.8 |
TIK | Russia | Tikhoretsk | 0.3 | 75.9 | 116.2 |
SLO | Slovenia | Ajdovscina | 14.7 | 2.5 | 31.8 |
SPB | Spain | Badajoz | 4.3 | 56.1 | 58.8 |
BAR | Spain | Barcelona | 20.3 | 1.7 | 61.0 |
SPC | Spain | Catarroja | 46.3 | 99.8 | 116.1 |
SPS | Spain | San Roque | 48.9 | 325.9 | 289.3 |
TAI | Taiwan | Tainan | 9.9 | 135.8 | 149.1 |
CHA | Thailand | Chanthaburi | 8.4 | 84.4 | 2.0 |
KAC | Thailand | Kanchanaburi | 30.1 | 49.9 | 45.4 |
LAM | Thailand | Lampang | 4.5 | 225.8 | 159.3 |
SSK | Thailand | Sisaket | 11.6 | 84.6 | 40.7 |
TUA | Turkey | Aliaga | 75.4 | 154.6 | 69.3 |
TUH | Turkey | Hopa | 5.8 | 0.3 | 65.2 |
BER | USA | Berlin, NJ | 0.3 | 33.5 | 33.6 |
PAL | USA | Palm Beach | 13.2 | 66.9 | 0.3 |
ALU | Ukraine | Alushta | 9.9 | 5.0 | 54.2 |
KER | Ukraine | Kerch, Crimea | 5.6 | 66.2 | 61.2 |
SEV | Ukraine | Sevastopol, Crimea | 5.1 | 70.8 | 66.0 |
HOC | Vietnam | Ho Chi Minh City | 54.6 | 127.4 | 85.0 |
QNC | Vietnam | Quy Nhon City | 3.4 | 82.5 | 6.6 |
# Create a new Word document
doc <- read_docx()
# Add the flextable
doc <- body_add_flextable(doc, value = ft)
# Save the document to a file
# Define the path for saving the Word document
file_path <- here("euro_global/output/snps_sets/linkage/decay_albopictus.docx")
print(doc, target = file_path)
Get Counts
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3/ld1.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Get the number of individuals
individuals_count <- table(fam_data$V1)
individuals_count
##
## ALD ALU ALV ARM BAR BEN BER BRE BUL CAM CES CHA CRO DES FRS GES GRA GRC GRV HAI
## 10 12 12 10 12 12 12 13 10 12 14 12 12 16 12 12 11 10 12 12
## HOC HUN INJ ITP ITR KAC KAG KAN KAT KER KRA LAM MAL MAT OKI PAL POP QNC RAR REC
## 7 12 11 9 12 6 12 11 6 12 12 9 12 12 12 11 12 11 12 11
## ROS SEV SIC SLO SOC SPB SPC SPS SSK STS SUF SUU TAI TIK TRE TUA TUH UTS YUN
## 11 12 9 12 12 8 6 8 12 12 6 6 7 12 12 9 12 12 9
Merge the data
# Convert individuals_count to a data frame
individuals_df <- as.data.frame(individuals_count)
colnames(individuals_df) <- c("Population", "Count")
# Merge with dist_table_condensed
merged_data <- merge(dist_table_condensed, individuals_df, by="Population")
Check correlation
# Chr1 vs Count
plot_Chr1 <- ggplot(merged_data, aes(x = Count, y = Chr1)) +
geom_point() +
geom_smooth(method = "lm", col = "orange") +
ggtitle("Correlation between Count and Chr1") +
labs(x = "Count", y = "LD Half Distance - Chr1")
# Chr2 vs Count
plot_Chr2 <- ggplot(merged_data, aes(x = Count, y = Chr2)) +
geom_point() +
geom_smooth(method = "lm", col = "green3") +
ggtitle("Correlation between Count and Chr2") +
labs(x = "Count", y = "LD Half Distance - Chr2")
# Chr3 vs Count
plot_Chr3 <- ggplot(merged_data, aes(x = Count, y = Chr3)) +
geom_point() +
geom_smooth(method = "lm", col = "blue") +
ggtitle("Correlation between Count and Chr3") +
labs(x = "Count", y = "LD Half Distance - Chr3")
# Display the plots
print(plot_Chr1)
print(plot_Chr2)
print(plot_Chr3)
Calculate correlations
# For Chr1
model_Chr1 <- lm(Chr1 ~ Count, data = merged_data)
summary_Chr1 <- summary(model_Chr1)
R2_Chr1 <- summary_Chr1$r.squared
p_value_Chr1 <- coef(summary_Chr1)[2,4]
# For Chr2
model_Chr2 <- lm(Chr2 ~ Count, data = merged_data)
summary_Chr2 <- summary(model_Chr2)
R2_Chr2 <- summary_Chr2$r.squared
p_value_Chr2 <- coef(summary_Chr2)[2,4]
# For Chr3
model_Chr3 <- lm(Chr3 ~ Count, data = merged_data)
summary_Chr3 <- summary(model_Chr3)
R2_Chr3 <- summary_Chr3$r.squared
p_value_Chr3 <- coef(summary_Chr3)[2,4]
# Display R2 and p-values
cat("For Chr1: R2 =", R2_Chr1, ", p-value =", p_value_Chr1, "\n")
## For Chr1: R2 = 0.06363568 , p-value = 0.05608641
Correlation for chromosome 2
## For Chr2: R2 = 0.1866813 , p-value = 0.0007071597
Correlation for chromosome 3
## For Chr3: R2 = 0.300802 , p-value = 8.30986e-06
Annotate the plots
# Helper function to extract coefficients, R2, and create the label
get_annotation <- function(model) {
coefs <- coef(model)
eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
return(paste(eq, r2, sep = "\n"))
}
# Annotations for each chromosome
annotate_Chr1 <- get_annotation(model_Chr1)
annotate_Chr2 <- get_annotation(model_Chr2)
annotate_Chr3 <- get_annotation(model_Chr3)
# Plot with annotations
plot_with_annotation <- function(data, yvar, label) {
ggplot(data, aes(x = Count, y = !!sym(yvar))) +
geom_point() +
geom_smooth(method = "lm", col = "orange") +
annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
labs(x = "Count", y = yvar)
}
plot1 <- plot_with_annotation(merged_data, "Chr1", annotate_Chr1)
# Display the plots with annotations
print(plot1)
## `geom_smooth()` using formula = 'y ~ x'
# Plot with annotations
plot_with_annotation <- function(data, yvar, label) {
ggplot(data, aes(x = Count, y = !!sym(yvar))) +
geom_point() +
geom_smooth(method = "lm", col = "green3") +
annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
labs(x = "Count", y = yvar)
}
plot2 <- plot_with_annotation(merged_data, "Chr2", annotate_Chr2)
print(plot2)
## `geom_smooth()` using formula = 'y ~ x'
plot_with_annotation <- function(data, yvar, label) {
ggplot(data, aes(x = Count, y = !!sym(yvar))) +
geom_point() +
geom_smooth(method = "lm", col = "blue") +
annotate("text", x = max(data$Count) * 0.7, y = max(data[[yvar]]) * 0.7, label = label, hjust = 0) +
labs(x = "Count", y = yvar)
}
plot3 <- plot_with_annotation(merged_data, "Chr3", annotate_Chr3)
print(plot3)
## `geom_smooth()` using formula = 'y ~ x'
# Read the data
library(data.table)
merged_data_long <- melt(setDT(merged_data), id.vars = c("Population","Country", "City", "Count"), variable.name = "Chromosome")
saveRDS(merged_data_long, here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/merged_data_long.rds"
))
Create a facet plot
merged_data_long <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/merged_data_long.rds"))
# Function to compute the linear model details
get_lm_details <- function(data, yvar) {
model <- lm(data[[yvar]] ~ data$Count)
coefs <- coef(model)
eq <- sprintf("y = %.2fx + %.2f", coefs[2], coefs[1])
r2 <- sprintf("R^2 = %.2f", summary(model)$r.squared)
return(tibble(Chromosome = yvar, Equation = eq, R2 = r2))
}
# Compute details for each chromosome and bind rows
annotations <- bind_rows(get_lm_details(merged_data, "Chr1"),
get_lm_details(merged_data, "Chr2"),
get_lm_details(merged_data, "Chr3"))
# Compute maximum y for annotations for each chromosome
annotations <- annotations %>%
left_join(merged_data_long %>% group_by(Chromosome) %>% summarise(MaxY = max(value, na.rm = TRUE)), by = "Chromosome")
# Plotting with annotations correctly positioned in the top right corner
facet_plot <- ggplot(merged_data_long, aes(x = Count, y = value)) +
geom_point() +
geom_smooth(method = "lm", col = "black") +
geom_text(data = annotations, aes(label = paste(Equation, R2, sep = "\n"), y = MaxY),
x = max(merged_data$Count), hjust = 1, vjust = 1) +
facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
labs(x = "Number of Samples", y = "LD Half Distance (kb)") +
my_theme()
print(facet_plot)
## `geom_smooth()` using formula = 'y ~ x'
# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_sample_size.pdf"), width = 4, height = 6)
Plot all countries
# Calculate mean for each Country and Chromosome
mean_data <- merged_data_long %>%
group_by(Country, Chromosome) %>%
summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")
# Jitter plot overlaid on boxplot for each chromosome by country without legend
ggplot(merged_data_long, aes(x = Country, y = value)) +
geom_boxplot(alpha = 0.5) +
geom_jitter(aes(color = Country), width = 0.2) +
geom_text(data = mean_data, aes(y = mean_value, label = round(mean_value, 2)), vjust = 0.5, hjust = -0.5, size = 2) + # Annotate mean value
facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
theme_minimal() +
labs(title = "", y = "LD Half Distance (kb)") +
my_theme() +
coord_flip() +
theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "none") +
theme(axis.text.y = element_text(size = 5, angle = 0, hjust = 1))
# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_mean_by_country.pdf"), width = 6, height = 10)
## [1] "Albania" "Ukraine" "Armenia" "Spain" "India" "USA"
## [7] "Italy" "Bulgaria" "Cambodia" "Thailand" "Croatia" "France"
## [13] "Georgia" "Greece" "Brazil" "China" "Vietnam" "Indonesia"
## [19] "Japan" "Russia" "Malta" "Malaysia" "Portugal" "Romania"
## [25] "Slovenia" "Taiwan" "Turkey"
Use ggstatplot to test if there is any significant differences Chr1
library(ggstatsplot)
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr1, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr1,
title = "Chr1",
xlab = "Country",
ylab = "LD Half Distance (kb)",
#centrality.plotting = FALSE,
#centrality.point.args = list(size = 3, color = "darkred"),
#centrality.label.args = NULL,
#package = "RColorBrewer",
#palette = "Dark2",
ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_chr1_test.pdf"), width = 10, height = 8)
Chr2
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr2, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr2,
title = "Chr2",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr2_test.pdf"), width = 10, height =8)
Chr3
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr3, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr3,
title = "Chr3",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
We can start with file7 (copied to the new linkage directory), because in the QC Markdown we created this file using the chromosomal scale, and it already has been filtered for relatedness, heterozygosity, missingness, etc.
List pops
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/file7.fam | sort | uniq -c | awk '{print $2, $1}'
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BEN 12
## BER 12
## BRE 13
## BUL 10
## CAM 12
## CES 14
## CHA 12
## CRO 12
## DES 16
## FRS 12
## GEL 2
## GES 12
## GRA 11
## GRC 10
## GRV 12
## HAI 12
## HAN 4
## HOC 7
## HUN 12
## IMP 4
## INJ 11
## INW 4
## ITB 5
## ITP 9
## ITR 12
## JAF 2
## KAC 6
## KAG 12
## KAN 11
## KAT 6
## KER 12
## KLP 4
## KRA 12
## KUN 4
## LAM 9
## MAL 12
## MAT 12
## OKI 12
## PAL 11
## POL 2
## POP 12
## QNC 11
## RAR 12
## REC 11
## ROM 4
## ROS 11
## SER 4
## SEV 12
## SIC 9
## SLO 12
## SOC 12
## SON 3
## SPB 8
## SPC 6
## SPM 5
## SPS 8
## SSK 12
## STS 12
## SUF 6
## SUU 6
## TAI 7
## TIK 12
## TIR 4
## TRE 12
## TUA 9
## TUH 12
## UTS 12
## YUN 9
List pops with 10 or more
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
awk '{print $1}' output/snps_sets/linkage/file7.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 10 {print}'
## ALD 10
## ALU 12
## ALV 12
## ARM 10
## BAR 12
## BEN 12
## BER 12
## BRE 13
## BUL 10
## CAM 12
## CES 14
## CHA 12
## CRO 12
## DES 16
## FRS 12
## GES 12
## GRA 11
## GRC 10
## GRV 12
## HAI 12
## HUN 12
## INJ 11
## ITR 12
## KAG 12
## KAN 11
## KER 12
## KRA 12
## MAL 12
## MAT 12
## OKI 12
## PAL 11
## POP 12
## QNC 11
## RAR 12
## REC 11
## ROS 11
## SEV 12
## SLO 12
## SOC 12
## SSK 12
## STS 12
## TIK 12
## TRE 12
## TUH 12
## UTS 12
We can use the command above but only print the list of families, and create a new file (pops_4ld_2.txt) for the LD analysis.
Create list of populations
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
awk '{print $1}' output/snps_sets/linkage/file7.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 10 {print}' | awk '{print $1}' > output/snps_sets/linkage/pops_4ld_2.txt;
head -n100 output/snps_sets/linkage/pops_4ld_2.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BEN
## BER
## BRE
## BUL
## CAM
## CES
## CHA
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## GRV
## HAI
## HUN
## INJ
## ITR
## KAG
## KAN
## KER
## KRA
## MAL
## MAT
## OKI
## PAL
## POP
## QNC
## RAR
## REC
## ROS
## SEV
## SLO
## SOC
## SSK
## STS
## TIK
## TRE
## TUH
## UTS
We also want to remove American pops (just want native and Europe pops) Create a list of individuals we removed
echo 'BER
GRV
PAL
REC' > /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/populations_to_remove.txt
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage
input_file="pops_4ld_2.txt"
pops_to_remove_file="populations_to_remove.txt"
temp_file=$(mktemp)
grep -vFf "$pops_to_remove_file" "$input_file" > "$temp_file"
mv "$temp_file" "$input_file"
Check
## ALD
## ALU
## ALV
## ARM
## BAR
## BEN
## BER
## BRE
## BUL
## CAM
## CES
## CHA
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## GRV
## HAI
## HUN
## INJ
## ITR
## KAG
## KAN
## KER
## KRA
## MAL
## MAT
## OKI
## PAL
## POP
## QNC
## RAR
## REC
## ROS
## SEV
## SLO
## SOC
## SSK
## STS
## TIK
## TRE
## TUH
## UTS
Now remove random individuals for pops with >10 individuals, so that all pops have only 10. We also want to save a list of the individuals that we are removing.
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage
input_file="file7.fam"
num_individuals=10
temp_file=$(mktemp)
removed_ind_file="removed_ind.txt" ## Specify the file to save removed individuals
shuf "$input_file" > "$temp_file"
#awk -v num="$num_individuals" '{a[$1]++} a[$1]<=num' "$temp_file" > "10ind_per_pop.txt"
awk -v num="$num_individuals" '{
if (!(a[$1]++ >= num)) {
print $0
} else {
print $0 >> "'"$removed_ind_file"'"
}
}' "$temp_file" > "10ind_per_pop.txt"
rm "$temp_file"
Check output file
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
awk '{print $1}' output/snps_sets/linkage/10ind_per_pop.txt | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 10 {print}'
## ALD 10
## ALU 10
## ALV 10
## ARM 10
## BAR 10
## BEN 10
## BER 10
## BRE 10
## BUL 10
## CAM 10
## CES 10
## CHA 10
## CRO 10
## DES 10
## FRS 10
## GES 10
## GRA 10
## GRC 10
## GRV 10
## HAI 10
## HUN 10
## INJ 10
## ITR 10
## KAG 10
## KAN 10
## KER 10
## KRA 10
## MAL 10
## MAT 10
## OKI 10
## PAL 10
## POP 10
## QNC 10
## RAR 10
## REC 10
## ROS 10
## SEV 10
## SLO 10
## SOC 10
## SSK 10
## STS 10
## TIK 10
## TRE 10
## TUH 10
## UTS 10
To actually remove the individuals not in the “10ind_per_pop.txt” file from the bed file, use the “removed_ind_file” file with the list of individuals that we removed
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
plink2 \
--allow-extra-chr \
--remove output/snps_sets/linkage/removed_ind.txt \
--bfile output/snps_sets/linkage/file7 \
--make-bed \
--out output/snps_sets/linkage/ld2 \
--silent;
grep 'samples\|variants\|remaining' output/snps_sets/linkage/ld2.log
688 samples (82 females, 64 males, 542 ambiguous; 688 founders) loaded from 87183 variants loaded from output/snps_sets/linkage/file7.bim. –remove: 606 samples remaining. 606 samples (72 females, 58 males, 476 ambiguous; 606 founders) remaining after
We will use MAF 0.005 for each chromosome (default from PopLDdecay) Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3733605 199.4 7322801 391.1 7322801 391.1
## Vcells 7645024 58.4 30092465 229.6 47019323 358.8
We can estimate LD for each chromosome separated.
Import the .bim file with the SNPs to create a new chromosomal scale. Now use ld1 (rather than file7) because it has pruned the individuals so no pop has >10 individuals
snps <-
read_delim( # to learn about the options use here, run ?read_delim on the console.
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/ld2.bim"
), # use library here to load it
col_names = FALSE, # we don't have header in the input file
show_col_types = FALSE, # suppress message from read_delim
col_types = "ccidcc" # set the class of each column
)
#
# set column names
colnames(
snps
) <- # to add a header in our tibble
c(
"Chromosome", "SNP", "Cm", "Position", "Allele1", "Allele2"
)
#
# check the tibble
head(snps)
## # A tibble: 6 × 6
## Chromosome SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583035163 0 315386 A G
## 2 1 AX-583033356 0 315674 C T
## 3 1 AX-583033370 0 330057 G A
## 4 1 AX-583035194 0 330265 A G
## 5 1 AX-583033387 0 331288 C T
## 6 1 AX-583035257 0 442875 T C
Separate the tibbles into each chromosome.
# chr1
chr1_snps <-
snps |>
filter(Chromosome == 1) |> # here we get only Chromosome rows starting with 1
as_tibble() # save as tibble
#
# chr2
chr2_snps <-
snps |>
filter(Chromosome == 2) |>
as_tibble()
#
# chr3
chr3_snps <-
snps |>
filter(Chromosome == 3) |>
as_tibble()
We have objects with the SNPs for each chromosome
## # A tibble: 6 × 6
## Chromosome SNP Cm Position Allele1 Allele2
## <chr> <chr> <int> <dbl> <chr> <chr>
## 1 1 AX-583035163 0 315386 A G
## 2 1 AX-583033356 0 315674 C T
## 3 1 AX-583033370 0 330057 G A
## 4 1 AX-583035194 0 330265 A G
## 5 1 AX-583033387 0 331288 C T
## 6 1 AX-583035257 0 442875 T C
Filter the data by chromosome Chromosome 1
write.table(
chr1_snps$SNP,
file = here(
"euro_global/output/snps_sets/linkage/chr1_ld2_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Chromosome 2
write.table(
chr2_snps$SNP,
file = here(
"euro_global/output/snps_sets/linkage/chr2_ld2_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Chromosome 3
write.table(
chr3_snps$SNP,
file = here(
"euro_global/output/snps_sets/linkage/chr3_ld2_SNPs.txt"
),
sep = "\t",
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
Now create new files for each chromosome
cd /gpfs/gibbs/pi/caccone/mkc54/albo
mkdir -p euro_global/output/snps_sets/linkage/chr1_2;
mkdir -p euro_global/output/snps_sets/linkage/chr2_2;
mkdir -p euro_global/output/snps_sets/linkage/chr3_2;
Chromosome 1
cd /gpfs/gibbs/pi/caccone/mkc54/albo
plink2 \
--allow-extra-chr \
--bfile euro_global/output/snps_sets/linkage/ld2 \
--keep-fam euro_global/output/snps_sets/linkage/pops_4ld_2.txt \
--extract euro_global/output/snps_sets/linkage/chr1_ld2_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out euro_global/output/snps_sets/linkage/chr1_2/ld2 \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/snps_sets/linkage/chr1_2/ld2.log
606 samples (72 females, 58 males, 476 ambiguous; 606 founders) loaded from 87183 variants loaded from euro_global/output/snps_sets/linkage/ld1.bim. –extract: 19362 variants remaining. –keep-fam: 410 samples remaining. 410 samples (35 females, 41 males, 334 ambiguous; 410 founders) remaining after 0 variants removed due to allele frequency threshold(s) 19362 variants remaining after main filters.
Chromosome 2
plink2 \
--allow-extra-chr \
--bfile euro_global/output/snps_sets/linkage/ld2 \
--keep-fam euro_global/output/snps_sets/linkage/pops_4ld_2.txt \
--extract euro_global/output/snps_sets/linkage/chr2_ld2_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out euro_global/output/snps_sets/linkage/chr2_2/ld2 \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/snps_sets/linkage/chr2_2/ld2.log
606 samples (72 females, 58 males, 476 ambiguous; 606 founders) loaded from 87183 variants loaded from euro_global/output/snps_sets/linkage/ld1.bim. –extract: 36611 variants remaining. –keep-fam: 410 samples remaining. 410 samples (35 females, 41 males, 334 ambiguous; 410 founders) remaining after 0 variants removed due to allele frequency threshold(s) 36611 variants remaining after main filters.
Chromosome 3
plink2 \
--allow-extra-chr \
--bfile euro_global/output/snps_sets/linkage/ld2 \
--keep-fam euro_global/output/snps_sets/linkage/pops_4ld_2.txt \
--extract euro_global/output/snps_sets/linkage/chr3_ld2_SNPs.txt \
--make-bed \
--maf 0.01 \
--export vcf \
--out euro_global/output/snps_sets/linkage/chr3_2/ld2 \
--silent;
grep 'samples\|variants\|remaining' euro_global/output/snps_sets/linkage/chr3_2/ld2.log
606 samples (72 females, 58 males, 476 ambiguous; 606 founders) loaded from 87183 variants loaded from euro_global/output/snps_sets/linkage/ld1.bim. –extract: 31210 variants remaining. –keep-fam: 410 samples remaining. 410 samples (35 females, 41 males, 334 ambiguous; 410 founders) remaining after 0 variants removed due to allele frequency threshold(s) 31210 variants remaining after main filters.
Check the vcfs
## /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/ld2.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/ld2.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/ld2.vcf
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3733774 199.5 7322801 391.1 7322801 391.1
## Vcells 7641871 58.4 30092465 229.6 47019323 358.8
Prepare the files required by PopLDdecay
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
bcftools query -l output/snps_sets/linkage/chr1_2/ld2.vcf > output/snps_sets/linkage/vcf_samples.txt
bcftools query -l output/snps_sets/linkage/chr1_2/ld2.vcf | cut -d'_' -f1 | sort | uniq > output/snps_sets/linkage/unique_pops_from_vcf.txt
while read pop; do
grep "^${pop}_" output/snps_sets/linkage/vcf_samples.txt > output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin
#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
PopLDdecay -InVCF output/snps_sets/linkage/chr1_2/ld2.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/snps_sets/linkage/chr1_2/${pop}_LDdecay.gz \
-SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
Get the files we need
for file in output/snps_sets/linkage/chr1_2/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/snps_sets/linkage/chr1_2/ld_decay_results_list.txt
head -100 output/snps_sets/linkage/chr1_2/ld_decay_results_list.txt
Exit the conda environment and load R
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
module load Perl/5.36.1-GCCcore-12.2.0
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl --help
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl \
-in output/snps_sets/linkage/chr1_2/ld_decay_results_list.txt \
-output output/snps_sets/linkage/chr1_2/all_pops \
-bin1 10 \
-bin2 100 \
-break 100 \
-maxX 1000 \
-measure r2 \
-method MeanBin \
-keepR
library(ggrepel)
library(RColorBrewer)
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/ld2.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
library(scales)
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
ggsave(
filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr1_2.pdf"),
width = 14,
height = 16,
units = "in"
)
Save the data to use later
saveRDS(half_distances, here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/chr1.rds"
))
Load the data for chrom 1
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4139154 221.1 7322801 391.1 7322801 391.1
## Vcells 11408873 87.1 38032589 290.2 47019323 358.8
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin
#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
PopLDdecay -InVCF output/snps_sets/linkage/chr2_2/ld2.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/snps_sets/linkage/chr2_2/${pop}_LDdecay.gz \
-SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
Get the files we need
for file in output/snps_sets/linkage/chr2_2/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/snps_sets/linkage/chr2_2/ld_decay_results_list.txt
head -100 output/snps_sets/linkage/chr2_2/ld_decay_results_list.txt
Exit the conda environment and load R
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
module load Perl/5.36.1-GCCcore-12.2.0
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl --help
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl \
-in output/snps_sets/linkage/chr2_2/ld_decay_results_list.txt \
-output output/snps_sets/linkage/chr2_2/all_pops \
-bin1 10 \
-bin2 100 \
-break 100 \
-maxX 1000 \
-measure r2 \
-method MeanBin \
-keepR
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/ld2.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
#p
ggsave(
filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr2_2.pdf"),
width = 14,
height = 16,
units = "in"
)
Note: This produces a very big image, so you need to expand the viewer as large as possible to avoid getting an error
Save the data to use later
saveRDS(half_distances, here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/chr2.rds"
))
Load the data for chrom 2
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4139267 221.1 7322801 391.1 7322801 391.1
## Vcells 11410075 87.1 38032589 290.2 47019323 358.8
module load miniconda
conda activate PopLDdecay-env
export PATH=$PATH:/gpfs/gibbs/project/caccone/mkc54/PopLDdecay/PopLDdecay/bin
#module spider perl
module load Perl/5.36.1-GCCcore-12.2.0
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
while read pop; do
PopLDdecay -InVCF output/snps_sets/linkage/chr3_2/ld2.vcf \
-OutType 4 \
-MaxDist 1000 \
-MAF 0.01 \
-OutFilterSNP \
-OutStat output/snps_sets/linkage/chr3_2/${pop}_LDdecay.gz \
-SubPop output/snps_sets/linkage/${pop}_samples.txt
done < output/snps_sets/linkage/unique_pops_from_vcf.txt
Get the files we need
for file in output/snps_sets/linkage/chr3_2/*_LDdecay.stat.gz; do
pop_name=$(basename $file _LDdecay.stat.gz)
echo "$file $pop_name"
done > output/snps_sets/linkage/chr3_2/ld_decay_results_list.txt
head -100 output/snps_sets/linkage/chr3_2/ld_decay_results_list.txt
Exit the conda environment and load R
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
module load Perl/5.36.1-GCCcore-12.2.0
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl --help
Plot using PopLDdecay. It will create files that we can use to plot with ggplot
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global
perl /gpfs/gibbs/project/caccone/mkc54/PopLDdecay/bin/Plot_MultiPop.pl \
-in output/snps_sets/linkage/chr3_2/ld_decay_results_list.txt \
-output output/snps_sets/linkage/chr3_2/all_pops \
-bin1 10 \
-bin2 100 \
-break 100 \
-maxX 1000 \
-measure r2 \
-method MeanBin \
-keepR
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/ld2.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Extract the family names from the first column
populations <- unique(fam_data$V1)
# Determine the number of unique populations
num_populations <- length(unique(populations))
# Fetch colors accordingly
colors_set3 <- brewer.pal(min(12, num_populations), "Set3")
colors_pastel2 <- brewer.pal(min(8, num_populations - length(colors_set3)), "Pastel2")
colors_paired <- brewer.pal(min(12, num_populations - length(colors_set3) - length(colors_pastel2)), "Paired")
# Combine the colors
colors <- c(colors_set3, colors_pastel2, colors_paired)
# Ensure we're only taking as many colors as there are populations
colors <- colors[1:num_populations]
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Initialize an empty list to store data frames
data_frames <- list()
# Read data from each population file and store it in the list
for (pop in populations) {
file_path <- paste("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/all_pops.", pop, sep = "")
data <- read.table(file_path)
data$population <- pop # Add a column for population name
data_frames[[pop]] <- data
}
# Combine all data frames into a single data frame
combined_data <- do.call(rbind, data_frames)
# Create a named vector for populations and their colors
color_mapping <- setNames(colors, populations)
# Calculate half distances
half_distances <- combined_data |>
group_by(population) |>
mutate(max_r2 = max(V2)) |>
filter(V2 <= max_r2 / 2) |>
arrange(V1) |>
slice(1) |>
select(population, half_distance = V1)
# Calculate the maximum V2 value for each population
max_values <- combined_data |>
group_by(population) |>
summarize(max_V2 = max(V2))
# Merge the maximum V2 values with the half_distances dataframe
half_distances <- merge(half_distances, max_values, by = "population")
# Create the ggplot
p <- ggplot(combined_data, aes(x = V1 / 1000, y = V2, group = population, color = population)) +
geom_point(size = .3, alpha = 0.1) +
geom_smooth(method = "loess", se = FALSE, span = 0.3, color = "black", linewidth = 0.5) +
geom_vline(data = half_distances, aes(xintercept = half_distance / 1000), color = "blue", linetype = "dashed") +
geom_text_repel(data = half_distances, aes(x = half_distance / 1000 + 130, y = max_V2, label = paste0(round(half_distance / 1000, 1), "Kb")),
color = "blue", vjust = -1, size = 3) +
labs(
title = "",
x = "Distance(Kb)",
y = expression(r^2)
) +
scale_color_manual(values = color_mapping) +
scale_y_continuous(labels = label_comma(accuracy = 0.01)) +
scale_x_continuous(labels = label_comma()) +
theme_minimal() +
theme(
legend.position = "none",
axis.title.x = element_text(face = "bold", size = 14),
axis.title.y = element_text(face = "bold", size = 14),
strip.text = element_text(face = "bold")
) +
facet_wrap(~ population, scales = "free_y", ncol = 2) +
coord_cartesian(xlim = c(0, 1000))
#p
ggsave(
filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr3_2.pdf"),
width = 14,
height = 16,
units = "in"
)
ggsave(
filename = here("scripts", "RMarkdowns", "output", "euro_global", "ld", "decay_chr3_2.png"),
width = 14,
height = 16,
units = "in",
bg = "white"
)
Save the data to use later
saveRDS(half_distances, here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/chr3.rds"
))
Load the data for chrom 3
Clean env and memory
# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4139392 221.1 7322801 391.1 7322801 391.1
## Vcells 11411216 87.1 38032589 290.2 47019323 358.8
Create data frame with the data from each chromosome
Import the data
# Chromosome 1
chr1 <- readRDS(here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr1_2/chr1.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr1 = half_distance
) |>
mutate(chr1 = chr1/1000)
# Chromosome 2
chr2 <- readRDS(here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr2_2/chr2.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr2 = half_distance
) |>
mutate(chr2 = chr2/1000)
# Chromosome 3
chr3 <- readRDS(here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/chr3.rds"
)) |> dplyr::select(population, half_distance
) |> dplyr::rename(
chr3 = half_distance
) |>
mutate(chr3 = chr3/1000)
# Merge
distance <- merge(merge(chr1, chr2, by="population"), chr3, by="population")
# Convert data from wide to long format
data_long <- gather(distance, key = "chr", value = "value", -population)
# Make it capital
data_long$chr <- tools::toTitleCase(data_long$chr)
# Remove KAT because it has mosquitoes collected from different source and had higher linkage due it
data_long <- subset(data_long, population != "KAT")
Make one plot
# Define a custom color palette
custom_palette <- c(
"Chr1" = "orange",
"Chr2" = "green3",
"Chr3" = "blue"
)
# Reordering the levels of the chr factor
data_long$chr <- factor(data_long$chr, levels = c("Chr3", "Chr2", "Chr1"))
# source the plotting function
source(here("analyses", "my_theme2.R"))
# Plotting half_distance with borders and spaced bars
ggplot(data_long, aes(x = population, y = value, fill = chr)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
labs(title = "", y = "Half Distance (kb)", x = "Population") +
geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
my_theme() +
coord_flip() +
scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
theme(legend.position = "top",
plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"))
Save
ggsave("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/ld/decay_comparison_2.pdf", width = 8, height = 14)
Import sample locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
## population chr value
## 1 ALD Chr1 40.7
## 2 ALU Chr1 5.6
## 3 ALV Chr1 19.5
## 4 ARM Chr1 14.2
## 5 BAR Chr1 20.3
## 6 BEN Chr1 18.9
# Join with sampling_loc to get sampling localities
distance2 <- data_long |>
left_join(sampling_loc, by = c("population" = "Abbreviation"))
head(distance2)
## population chr value Pop_City Country Latitude Longitude Continent Year
## 1 ALD Chr1 40.7 Durres Albania 41.29704 19.50373 Europe 2018
## 2 ALU Chr1 5.6 Alushta Ukraine 44.68289 34.40368 Europe 2021
## 3 ALV Chr1 19.5 Vlore Albania 40.46600 19.48970 Europe 2020
## 4 ARM Chr1 14.2 Ijevan Armenia 40.87971 45.14764 Europe 2020
## 5 BAR Chr1 20.3 Barcelona Spain 41.38510 2.17340 Europe 2018
## 6 BEN Chr1 18.9 Bengaluru India 12.97160 77.59460 Asia Unknown
## Region Subregion order
## 1 Southern Europe East Europe 25
## 2 Eastern Europe East Europe 35
## 3 Southern Europe East Europe 24
## 4 Eastern Europe East Europe 42
## 5 Southern Europe West Europe 8
## 6 South Asia 52
Add the name of the city/countries to the plot
# Creating the label with population, city, and country for the y-axis
distance2$pop_city_country_label <- paste(distance2$population, "\n", distance2$Pop_City, "\n(", distance2$Country, ")", sep = "")
# Sorting by Country, then City, and then by Population to ensure populations from the same country (and city) are plotted together
distance2 <- distance2 %>% arrange(Country, Pop_City, population)
# Adjusting the factor levels for plotting in the desired order
distance2$pop_city_country_label <- fct_inorder(distance2$pop_city_country_label)
# Plotting the data
ggplot(distance2, aes(x = pop_city_country_label, y = value, fill = chr)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black", width = 0.7) +
labs(title = "", y = "Half Distance (kb)", x = "") +
geom_text(aes(label = round(value, 0)), position = position_dodge(width = 0.8), vjust = 0.5, hjust = -0.2, size = 2.5) +
my_theme() +
coord_flip() +
labs(title = "", y = "Half Distance (kb)", x = "Population") +
scale_fill_manual(values = custom_palette, name = "Chromosome", breaks = c("Chr1", "Chr2", "Chr3")) +
theme(legend.position = "top",
plot.margin = margin(t = 10, r = 30, b = 10, l = 10, unit = "pt"),
axis.text.y = element_text(size = 5, angle = 0, hjust = 1))
Save
ggsave("scripts/RMarkdowns/output/euro_global/ld/decay_comp_poplabels_2.pdf", width = 8, height = 14)
# Create table
dist_table <- distance2 |>
dplyr::select(
population, Pop_City, Country, chr, value
) |>
dplyr::rename(
City = Pop_City,
Population = population
)
# Spread and arrange
dist_table_condensed <- dist_table %>%
spread(key = chr, value = value) %>%
dplyr::select(Population, Country, City, Chr1, Chr2, Chr3) |>
dplyr::arrange(Country, City)
# Create table
ft <- flextable::flextable(dist_table_condensed)
# Apply zebra striping with bg
ft <- flextable::theme_zebra(ft)
# Show it
ft
Population | Country | City | Chr1 | Chr2 | Chr3 |
---|---|---|---|---|---|
ALD | Albania | Durres | 40.7 | 72.3 | 47.2 |
ALV | Albania | Vlore | 19.5 | 2.5 | 81.2 |
ARM | Armenia | Ijevan | 14.2 | 53.4 | 37.8 |
BUL | Bulgaria | Lom | 57.9 | 95.5 | 57.0 |
CAM | Cambodia | Phnom Penh | 9.9 | 216.9 | 4.6 |
HAI | China | Hainan | 29.5 | 71.5 | 35.6 |
HUN | China | Hunan | 21.8 | 3.6 | 60.6 |
CRO | Croatia | Dubrovnik | 49.9 | 1.6 | 85.0 |
FRS | France | Saint-Martin-d'Heres | 3.1 | 25.5 | 70.5 |
STS | France | Strasbourg | 26.6 | 74.2 | 88.6 |
GES | Georgia | Sakhumi, Abkhazia | 5.6 | 109.6 | 79.1 |
GRA | Greece | Athens | 19.5 | 3.3 | 0.5 |
GRC | Greece | Chania | 62.0 | 141.4 | 87.9 |
BEN | India | Bengaluru | 18.9 | 112.6 | 53.2 |
INJ | Indonesia | Jakarta | 8.0 | 107.0 | 65.4 |
BRE | Italy | Brescia | 21.3 | 57.0 | 72.0 |
CES | Italy | Cesena | 20.3 | 3.5 | 44.2 |
DES | Italy | Desenzano | 5.6 | 64.5 | 108.7 |
ITR | Italy | Rome (Trappola) | 16.1 | 57.4 | 36.8 |
TRE | Italy | Trentino | 12.2 | 42.6 | 56.4 |
KAG | Japan | Kagoshima | 28.0 | 30.3 | 35.6 |
KAN | Japan | Kanazawa | 23.0 | 41.0 | 39.1 |
OKI | Japan | Okinawa | 9.1 | 20.4 | 161.5 |
UTS | Japan | Utsunomiya | 35.4 | 13.2 | 32.2 |
MAT | Malaysia | Tambun | 11.4 | 246.1 | 104.3 |
MAL | Malta | Luqa | 25.3 | 53.4 | 81.4 |
POP | Portugal | Penafiel | 48.7 | 39.1 | 57.0 |
ROS | Romania | Satu Mare | 13.6 | 69.0 | 51.4 |
RAR | Russia | Armavir | 24.8 | 110.3 | 88.5 |
KRA | Russia | Krasnodar | 37.0 | 1.8 | 99.6 |
SOC | Russia | Sochi | 34.1 | 129.5 | 82.6 |
TIK | Russia | Tikhoretsk | 5.6 | 165.6 | 155.7 |
SLO | Slovenia | Ajdovscina | 20.3 | 53.4 | 77.7 |
BAR | Spain | Barcelona | 20.3 | 1.7 | 75.2 |
CHA | Thailand | Chanthaburi | 12.3 | 129.5 | 24.5 |
SSK | Thailand | Sisaket | 12.3 | 39.4 | 40.7 |
TUH | Turkey | Hopa | 20.3 | 0.3 | 81.2 |
ALU | Ukraine | Alushta | 5.6 | 61.3 | 64.4 |
KER | Ukraine | Kerch, Crimea | 5.6 | 110.9 | 92.5 |
SEV | Ukraine | Sevastopol, Crimea | 5.1 | 70.8 | 66.0 |
QNC | Vietnam | Quy Nhon City | 3.4 | 82.5 | 6.6 |
library(flextable)
library(officer)
# Create a new Word document
doc <- read_docx()
# Add the flextable
doc <- body_add_flextable(doc, value = ft)
# Save the document to a file
# Define the path for saving the Word document
file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_albopictus_2.docx")
print(doc, target = file_path)
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/ld2.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Get the number of individuals
individuals_count <- table(fam_data$V1)
individuals_count
##
## ALD ALU ALV ARM BAR BEN BRE BUL CAM CES CHA CRO DES FRS GES GRA GRC HAI HUN INJ
## 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## ITR KAG KAN KER KRA MAL MAT OKI POP QNC RAR ROS SEV SLO SOC SSK STS TIK TRE TUH
## 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## UTS
## 10
Merge the data
# Convert individuals_count to a data frame
individuals_df <- as.data.frame(individuals_count)
colnames(individuals_df) <- c("Population", "Count")
# Merge with dist_table_condensed
merged_data <- merge(dist_table_condensed, individuals_df, by="Population")
Check to make sure all pops have 10 for each chromosome
# Chr1 vs Count
plot_Chr1 <- ggplot(merged_data, aes(x = Count, y = Chr1)) +
geom_point() +
geom_smooth(method = "lm", col = "orange") +
ggtitle("Correlation between Count and Chr1") +
labs(x = "Count", y = "LD Half Distance - Chr1")
# Chr2 vs Count
plot_Chr2 <- ggplot(merged_data, aes(x = Count, y = Chr2)) +
geom_point() +
geom_smooth(method = "lm", col = "green3") +
ggtitle("Correlation between Count and Chr2") +
labs(x = "Count", y = "LD Half Distance - Chr2")
# Chr3 vs Count
plot_Chr3 <- ggplot(merged_data, aes(x = Count, y = Chr3)) +
geom_point() +
geom_smooth(method = "lm", col = "blue") +
ggtitle("Correlation between Count and Chr3") +
labs(x = "Count", y = "LD Half Distance - Chr3")
# Display the plots
print(plot_Chr1)
## `geom_smooth()` using formula = 'y ~ x'
There is no variation in the number of individuals, since we filtered out all pops with <10 and removed individuals for pops with >10.
# Calculate mean for each Country and Chromosome
mean_data <- merged_data_long %>%
group_by(Country, Chromosome) %>%
summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")
# Jitter plot overlaid on boxplot for each chromosome by country without legend
ggplot(merged_data_long, aes(x = Country, y = value)) +
geom_boxplot(alpha = 0.5) +
geom_jitter(aes(color = Country), width = 0.2) +
geom_text(data = mean_data, aes(y = mean_value, label = round(mean_value, 2)), vjust = 0.5, hjust = -0.5, size = 2) + # Annotate mean value
facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
theme_minimal() +
labs(title = "", y = "LD Half Distance (kb)") +
my_theme() +
coord_flip() +
theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "none") +
theme(axis.text.y = element_text(size = 5, angle = 0, hjust = 1))
## [1] "Albania" "Ukraine" "Armenia" "Spain" "India" "Italy"
## [7] "Bulgaria" "Cambodia" "Thailand" "Croatia" "France" "Georgia"
## [13] "Greece" "China" "Indonesia" "Japan" "Russia" "Malta"
## [19] "Malaysia" "Portugal" "Vietnam" "Romania" "Slovenia" "Turkey"
Use ggstatplot to test if there is any significant differences Chr1
library(ggstatsplot)
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr1, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr1,
title = "Chr1",
xlab = "Country",
ylab = "LD Half Distance (kb)",
#centrality.plotting = FALSE,
#centrality.point.args = list(size = 3, color = "darkred"),
#centrality.label.args = NULL,
#package = "RColorBrewer",
#palette = "Dark2",
ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_chr1_test.pdf"), width = 10, height = 8)
Chr2
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr2, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr2,
title = "Chr2",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr2_test.pdf"), width = 10, height =8)
Chr3
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Country = fct_reorder(Country, Chr3, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Country,
y = Chr3,
title = "Chr3",
xlab = "Country",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
## Number of labels is greater than default palette color count.
## • Select another color `palette` (and/or `package`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr3_test.pdf"), width = 10, height = 8)
Add region to merged_data
Import sample locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
## population chr value
## 1 ALD Chr1 40.7
## 2 ALU Chr1 5.6
## 3 ALV Chr1 19.5
## 4 ARM Chr1 14.2
## 5 BAR Chr1 20.3
## 6 BEN Chr1 18.9
# Join with sampling_loc to get sampling localities
distance2 <- data_long |>
left_join(sampling_loc, by = c("population" = "Abbreviation"))
head(distance2)
## population chr value Pop_City Country Latitude Longitude Continent Year
## 1 ALD Chr1 40.7 Durres Albania 41.29704 19.50373 Europe 2018
## 2 ALU Chr1 5.6 Alushta Ukraine 44.68289 34.40368 Europe 2021
## 3 ALV Chr1 19.5 Vlore Albania 40.46600 19.48970 Europe 2020
## 4 ARM Chr1 14.2 Ijevan Armenia 40.87971 45.14764 Europe 2020
## 5 BAR Chr1 20.3 Barcelona Spain 41.38510 2.17340 Europe 2018
## 6 BEN Chr1 18.9 Bengaluru India 12.97160 77.59460 Asia Unknown
## Region Subregion order
## 1 Southern Europe East Europe 25
## 2 Eastern Europe East Europe 35
## 3 Southern Europe East Europe 24
## 4 Eastern Europe East Europe 42
## 5 Southern Europe West Europe 8
## 6 South Asia 52
# Create table
dist_table <- distance2 |>
dplyr::select(
population, Pop_City, Country, Continent, chr, value
) |>
dplyr::rename(
City = Pop_City,
Population = population
)
# Spread and arrange
dist_table_condensed <- dist_table %>%
spread(key = chr, value = value) %>%
dplyr::select(Population, City, Country, Continent, Chr1, Chr2, Chr3) |>
dplyr::arrange(Continent, Country, City)
# Create table
ft <- flextable::flextable(dist_table_condensed)
# Apply zebra striping with bg
ft <- flextable::theme_zebra(ft)
# Show it
ft
Population | City | Country | Continent | Chr1 | Chr2 | Chr3 |
---|---|---|---|---|---|---|
CAM | Phnom Penh | Cambodia | Asia | 9.9 | 216.9 | 4.6 |
HAI | Hainan | China | Asia | 29.5 | 71.5 | 35.6 |
HUN | Hunan | China | Asia | 21.8 | 3.6 | 60.6 |
BEN | Bengaluru | India | Asia | 18.9 | 112.6 | 53.2 |
INJ | Jakarta | Indonesia | Asia | 8.0 | 107.0 | 65.4 |
KAG | Kagoshima | Japan | Asia | 28.0 | 30.3 | 35.6 |
KAN | Kanazawa | Japan | Asia | 23.0 | 41.0 | 39.1 |
OKI | Okinawa | Japan | Asia | 9.1 | 20.4 | 161.5 |
UTS | Utsunomiya | Japan | Asia | 35.4 | 13.2 | 32.2 |
MAT | Tambun | Malaysia | Asia | 11.4 | 246.1 | 104.3 |
CHA | Chanthaburi | Thailand | Asia | 12.3 | 129.5 | 24.5 |
SSK | Sisaket | Thailand | Asia | 12.3 | 39.4 | 40.7 |
QNC | Quy Nhon City | Vietnam | Asia | 3.4 | 82.5 | 6.6 |
ALD | Durres | Albania | Europe | 40.7 | 72.3 | 47.2 |
ALV | Vlore | Albania | Europe | 19.5 | 2.5 | 81.2 |
ARM | Ijevan | Armenia | Europe | 14.2 | 53.4 | 37.8 |
BUL | Lom | Bulgaria | Europe | 57.9 | 95.5 | 57.0 |
CRO | Dubrovnik | Croatia | Europe | 49.9 | 1.6 | 85.0 |
FRS | Saint-Martin-d'Heres | France | Europe | 3.1 | 25.5 | 70.5 |
STS | Strasbourg | France | Europe | 26.6 | 74.2 | 88.6 |
GES | Sakhumi, Abkhazia | Georgia | Europe | 5.6 | 109.6 | 79.1 |
GRA | Athens | Greece | Europe | 19.5 | 3.3 | 0.5 |
GRC | Chania | Greece | Europe | 62.0 | 141.4 | 87.9 |
BRE | Brescia | Italy | Europe | 21.3 | 57.0 | 72.0 |
CES | Cesena | Italy | Europe | 20.3 | 3.5 | 44.2 |
DES | Desenzano | Italy | Europe | 5.6 | 64.5 | 108.7 |
ITR | Rome (Trappola) | Italy | Europe | 16.1 | 57.4 | 36.8 |
TRE | Trentino | Italy | Europe | 12.2 | 42.6 | 56.4 |
MAL | Luqa | Malta | Europe | 25.3 | 53.4 | 81.4 |
POP | Penafiel | Portugal | Europe | 48.7 | 39.1 | 57.0 |
ROS | Satu Mare | Romania | Europe | 13.6 | 69.0 | 51.4 |
RAR | Armavir | Russia | Europe | 24.8 | 110.3 | 88.5 |
KRA | Krasnodar | Russia | Europe | 37.0 | 1.8 | 99.6 |
SOC | Sochi | Russia | Europe | 34.1 | 129.5 | 82.6 |
TIK | Tikhoretsk | Russia | Europe | 5.6 | 165.6 | 155.7 |
SLO | Ajdovscina | Slovenia | Europe | 20.3 | 53.4 | 77.7 |
BAR | Barcelona | Spain | Europe | 20.3 | 1.7 | 75.2 |
TUH | Hopa | Turkey | Europe | 20.3 | 0.3 | 81.2 |
ALU | Alushta | Ukraine | Europe | 5.6 | 61.3 | 64.4 |
KER | Kerch, Crimea | Ukraine | Europe | 5.6 | 110.9 | 92.5 |
SEV | Sevastopol, Crimea | Ukraine | Europe | 5.1 | 70.8 | 66.0 |
library(officer)
library(flextable)
# Create a new Word document
doc <- read_docx()
# Add the flextable
doc <- body_add_flextable(doc, value = ft)
# Save the document to a file
# Define the path for saving the Word document
file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_albopictus_withCont_2.docx")
print(doc, target = file_path)
# Define the path to the .fam file using here
fam_file_path <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/chr3_2/ld2.fam")
# Read the .fam file
fam_data <- read.table(fam_file_path, header = FALSE, stringsAsFactors = FALSE)
# Get the number of individuals
individuals_count <- table(fam_data$V1)
individuals_count
##
## ALD ALU ALV ARM BAR BEN BRE BUL CAM CES CHA CRO DES FRS GES GRA GRC HAI HUN INJ
## 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## ITR KAG KAN KER KRA MAL MAT OKI POP QNC RAR ROS SEV SLO SOC SSK STS TIK TRE TUH
## 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
## UTS
## 10
Merge the data
# Convert individuals_count to a data frame
individuals_df <- as.data.frame(individuals_count)
colnames(individuals_df) <- c("Population", "Count")
# Merge with dist_table_condensed
merged_data <- merge(dist_table_condensed, individuals_df, by="Population")
# Read the data
library(data.table)
merged_data_long <- melt(setDT(merged_data), id.vars = c("Population","Country", "Continent", "City", "Count"), variable.name = "Chromosome")
saveRDS(merged_data_long, here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/merged_data_long_2.rds"
))
Calc mean countries
# Calculate mean for each Country and Chromosome
mean_data <- merged_data_long %>%
group_by(Continent, Country, Chromosome) %>%
summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")
Plot by continent
# Jitter plot overlaid on boxplot for each chromosome by country without legend
ggplot(merged_data_long, aes(x = Continent, y = value)) +
geom_boxplot(alpha = 0.5) +
geom_jitter(aes(color = Continent), width = 0.2) +
geom_text(data = mean_data, aes(y = mean_value, label = round(mean_value, 2)), vjust = 0.5, hjust = -0.5, size = 2) + # Annotate mean value
facet_wrap(~ Chromosome, scales = "free_y", ncol = 1) +
theme_minimal() +
labs(title = "", y = "LD Half Distance (kb)") +
my_theme() +
coord_flip() +
theme(axis.text.x = element_text(angle = 0, hjust = 1), legend.position = "none") +
theme(axis.text.y = element_text(size = 5, angle = 0, hjust = 1))
# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_mean_by_continent_2.pdf"), width = 6, height = 10)
Use ggstatplot to test if there is any significant differences Chr1
library(ggstatsplot)
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Continent = fct_reorder(Continent, Chr1, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Continent,
y = Chr1,
title = "Chr1",
xlab = "Continent",
ylab = "LD Half Distance (kb)",
#centrality.plotting = FALSE,
#centrality.point.args = list(size = 3, color = "darkred"),
#centrality.label.args = NULL,
#package = "RColorBrewer",
#palette = "Dark2",
ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
# Save the plot
ggsave(here("scripts/RMarkdowns/output/euro_global/ld/decay_chr1_test_continent_2.pdf"), width = 10, height = 8)
Chr2
# Reorder the levels of Country based on the median of Chr1
#merged_data <- merged_data %>%
# mutate(Continent = fct_reorder(Continent, Chr2, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Continent,
y = Chr2,
title = "Chr2",
xlab = "Continent",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")
# Save the plot
ggsave(here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/linkage/decay_chr2_test_continent_2.pdf"), width = 10, height =8)
Chr3
# Reorder the levels of Country based on the median of Chr1
merged_data <- merged_data %>%
mutate(Continent = fct_reorder(Continent, Chr3, .fun = median))
# Plot
ggbetweenstats(
data = merged_data,
x = Continent,
y = Chr3,
title = "Chr3",
xlab = "Continent",
ylab = "LD Half Distance (kb)",
ggplot.component = list(theme(legend.position = "none"))
) + theme(axis.text.x = element_text(size = 5, angle = 0, hjust = 1), legend.position = "none")