R Markdown

Load Libraries

library(StAMPP)
## Loading required package: pegas
## Loading required package: ape
## 
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
## 
##     mst
## Registered S3 method overwritten by 'ade4':
##   method      from 
##   print.amova pegas
library(ggplot2)
library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.3.1     ✔ stringr 1.5.1
## ✔ readr   2.1.5     ✔ forcats 0.5.1
## ✔ purrr   1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::where()  masks ape::where()
library(adegenet)
## Loading required package: ade4
## 
## Attaching package: 'ade4'
## The following object is masked from 'package:pegas':
## 
##     amova
## 
##    /// adegenet 2.1.10 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()
library(here)
## here() starts at /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns
library(flextable)
## 
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
## 
##     compose
## The following object is masked from 'package:ape':
## 
##     rotate
library(officer)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(dplyr) 
library(tidyr)
library(geosphere)
library(flextable)
library(officer)
library(dartR) #do not load until Mantel test
## Loading required package: dartR.data
## **** Welcome to dartR.data [Version 1.0.8 ] ****
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Registered S3 method overwritten by 'genetics':
##   method      from 
##   [.haplotype pegas
## **** Welcome to dartR [Version 2.9.7 ] ****
## Be aware that owing to CRAN requirements and compatibility reasons not all functions of the package may run after the basic installation, as some packages could still be missing. Hence for a most enjoyable experience we recommend to run the function
## gl.install.vanilla.dartR()
## This installs all missing and required packages for your version of dartR. In case something fails during installation please refer to this tutorial: https://github.com/green-striped-gecko/dartR/wiki/Installation-tutorial.
## 
## For information how to cite dartR, please use:
## citation('dartR')
## Global verbosity is set to: 2
## 
## **** Have fun using dartR! ****
## 
## Attaching package: 'dartR'
## The following objects are masked from 'package:dartR.data':
## 
##     bandicoot.gl, possums.gl, testset.gl, testset.gs
#library(MASS) #do not load until Mantel test
library(Cairo)

1 Estimate Fst for European populations with at least 4 individuals using r<0.1 SNP set (Set 2)

Create list of populations

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe

awk '{print $1}' output/snps_sets/r2_0.1.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 4 {print}' | awk '{print $1}' > output/fst/pops_4fst.txt;
head  output/fst/pops_4fst.txt;
wc -l output/fst/pops_4fst.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## 40 output/fst/pops_4fst.txt

We have 40 populations with 4 or more mosquitoes.

1.1 Convert to raw format and subset the bed file

First load plink

module load PLINK/1.9b_6.21-x86_64
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.1 \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/r2_0.1 \
--silent;
grep 'samples\|variants\|remaining' output/fst/r2_0.1.log

47484 variants loaded from .bim file. –keep-fam: 407 people remaining. Total genotyping rate in remaining samples is 0.97243. 47484 variants and 407 people pass filters and QC.

Look at https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of Fst estimations

LD2 <-
  read.PLINK(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/r2_0.1.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )
## 
##  Reading PLINK raw format into a genlight object...
## Loading required package: parallel
## 
##  Reading loci information... 
## 
##  Reading and converting genotypes... 
## .
##  Building final object... 
## 
## ...done.
summary(LD2)
##   Length    Class     Mode 
##        1 genlight       S4

1.2 Convert the genlight object to Stampp format, and estimate pairwide Fst values

The command below would also work, but you can simplify it and put only the numbers: genome_equal_2 <- stamppFst(neutral, nboots=100, percent=95 + nclusters==10)

This chunk will take a couple minutes to run.

# convert
LD2_2 <- stamppConvert(LD2, type="genlight")
# run stampp. If you want to run with bootstraps and nclusters use the HPC. It will run out of memory on a 32Gb laptop
LD2_3 <- stamppFst(LD2_2, 1, 95, 1)

Save it

saveRDS(
  LD2_3, here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/LD2.rds"
  )
)

To load it

LD2_3 <- readRDS(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/LD2.rds"
  )
)

Now lets look at the object

summary(LD2_3)
##       SOC                SEV               GES               KER         
##  Min.   :0.007013   Min.   :0.03129   Min.   :0.01664   Min.   :0.03044  
##  1st Qu.:0.092839   1st Qu.:0.11584   1st Qu.:0.10032   1st Qu.:0.07663  
##  Median :0.111269   Median :0.13359   Median :0.11908   Median :0.09316  
##  Mean   :0.105796   Mean   :0.13094   Mean   :0.11590   Mean   :0.09695  
##  3rd Qu.:0.125958   3rd Qu.:0.15214   3rd Qu.:0.13293   3rd Qu.:0.10914  
##  Max.   :0.224918   Max.   :0.24959   Max.   :0.23132   Max.   :0.20467  
##  NA's   :1          NA's   :2         NA's   :3         NA's   :4        
##       KRA               TIK               RAR               TRE         
##  Min.   :0.01118   Min.   :0.05273   Min.   :0.05224   Min.   :0.02291  
##  1st Qu.:0.07272   1st Qu.:0.11849   1st Qu.:0.10744   1st Qu.:0.03536  
##  Median :0.08882   Median :0.13388   Median :0.12240   Median :0.05226  
##  Mean   :0.09044   Mean   :0.13637   Mean   :0.12788   Mean   :0.05576  
##  3rd Qu.:0.10320   3rd Qu.:0.14994   3rd Qu.:0.13898   3rd Qu.:0.06262  
##  Max.   :0.19549   Max.   :0.24710   Max.   :0.23435   Max.   :0.15338  
##  NA's   :5         NA's   :6         NA's   :7         NA's   :8        
##       ALU               STS               SIC               BRE         
##  Min.   :0.06457   Min.   :0.03029   Min.   :0.04408   Min.   :0.08652  
##  1st Qu.:0.07981   1st Qu.:0.05243   1st Qu.:0.06313   1st Qu.:0.11993  
##  Median :0.09611   Median :0.07010   Median :0.08064   Median :0.13676  
##  Mean   :0.10160   Mean   :0.07388   Mean   :0.08414   Mean   :0.13995  
##  3rd Qu.:0.10865   3rd Qu.:0.07797   3rd Qu.:0.09302   3rd Qu.:0.15482  
##  Max.   :0.19928   Max.   :0.17568   Max.   :0.19265   Max.   :0.24730  
##  NA's   :9         NA's   :10        NA's   :11        NA's   :12       
##       DES               CES              TIR                 IMP         
##  Min.   :0.07658   Min.   :0.1313   Min.   :-0.001035   Min.   :0.04641  
##  1st Qu.:0.10645   1st Qu.:0.1513   1st Qu.: 0.062685   1st Qu.:0.05972  
##  Median :0.12447   Median :0.1740   Median : 0.073933   Median :0.08137  
##  Mean   :0.12570   Mean   :0.1738   Mean   : 0.082503   Mean   :0.08277  
##  3rd Qu.:0.13847   3rd Qu.:0.1849   3rd Qu.: 0.100562   3rd Qu.:0.09006  
##  Max.   :0.22575   Max.   :0.2813   Max.   : 0.221360   Max.   :0.22977  
##  NA's   :13        NA's   :14       NA's   :15          NA's   :16       
##       ROM                GRC               BAR               BUL         
##  Min.   :0.006715   Min.   :0.02337   Min.   :0.04513   Min.   :0.05459  
##  1st Qu.:0.043622   1st Qu.:0.05981   1st Qu.:0.08039   1st Qu.:0.06945  
##  Median :0.065630   Median :0.07719   Median :0.09870   Median :0.08783  
##  Mean   :0.066354   Mean   :0.08120   Mean   :0.10234   Mean   :0.09073  
##  3rd Qu.:0.079526   3rd Qu.:0.09579   3rd Qu.:0.12077   3rd Qu.:0.09910  
##  Max.   :0.208063   Max.   :0.19013   Max.   :0.21865   Max.   :0.19947  
##  NA's   :17         NA's   :18        NA's   :19        NA's   :20       
##       CRO               GRA               ITB               MAL         
##  Min.   :0.02801   Min.   :0.04833   Min.   :0.05117   Min.   :0.01902  
##  1st Qu.:0.05670   1st Qu.:0.05786   1st Qu.:0.06880   1st Qu.:0.03881  
##  Median :0.06688   Median :0.07433   Median :0.08339   Median :0.05639  
##  Mean   :0.07571   Mean   :0.07941   Mean   :0.09148   Mean   :0.05939  
##  3rd Qu.:0.09042   3rd Qu.:0.09380   3rd Qu.:0.10154   3rd Qu.:0.07280  
##  Max.   :0.17399   Max.   :0.18378   Max.   :0.22018   Max.   :0.16306  
##  NA's   :21        NA's   :22        NA's   :23        NA's   :24       
##       SPM                TUA               TUH               ALD         
##  Min.   :0.006964   Min.   :0.02906   Min.   :0.03621   Min.   :0.02222  
##  1st Qu.:0.045323   1st Qu.:0.05758   1st Qu.:0.04658   1st Qu.:0.08246  
##  Median :0.056999   Median :0.07472   Median :0.07008   Median :0.09953  
##  Mean   :0.067238   Mean   :0.08082   Mean   :0.07368   Mean   :0.10658  
##  3rd Qu.:0.082719   3rd Qu.:0.09884   3rd Qu.:0.08475   3rd Qu.:0.12488  
##  Max.   :0.211535   Max.   :0.20012   Max.   :0.17513   Max.   :0.21635  
##  NA's   :25         NA's   :26        NA's   :27        NA's   :28       
##       FRS               ITP               POP               ROS         
##  Min.   :0.03816   Min.   :0.02283   Min.   :0.03474   Min.   :0.05727  
##  1st Qu.:0.04250   1st Qu.:0.04174   1st Qu.:0.05124   1st Qu.:0.08888  
##  Median :0.06173   Median :0.06601   Median :0.05630   Median :0.09642  
##  Mean   :0.06881   Mean   :0.07004   Mean   :0.06996   Mean   :0.10679  
##  3rd Qu.:0.07313   3rd Qu.:0.07749   3rd Qu.:0.07207   3rd Qu.:0.10830  
##  Max.   :0.16475   Max.   :0.17488   Max.   :0.16561   Max.   :0.20116  
##  NA's   :29        NA's   :30        NA's   :31        NA's   :32       
##       SER              SLO               SPC               SPB         
##  Min.   :0.1539   Min.   :0.04114   Min.   :0.00677   Min.   :0.05962  
##  1st Qu.:0.1757   1st Qu.:0.05356   1st Qu.:0.06874   1st Qu.:0.07442  
##  Median :0.1885   Median :0.05762   Median :0.08788   Median :0.08698  
##  Mean   :0.1932   Mean   :0.06054   Mean   :0.08103   Mean   :0.08964  
##  3rd Qu.:0.2100   3rd Qu.:0.06235   3rd Qu.:0.10608   3rd Qu.:0.10220  
##  Max.   :0.2387   Max.   :0.09073   Max.   :0.13571   Max.   :0.12500  
##  NA's   :33       NA's   :34        NA's   :35        NA's   :36       
##       SPS               ARM               ALV               ITR     
##  Min.   :0.07023   Min.   :0.06867   Min.   :0.05863   Min.   : NA  
##  1st Qu.:0.09195   1st Qu.:0.07298   1st Qu.:0.05863   1st Qu.: NA  
##  Median :0.11366   Median :0.07730   Median :0.05863   Median : NA  
##  Mean   :0.09958   Mean   :0.07730   Mean   :0.05863   Mean   :NaN  
##  3rd Qu.:0.11425   3rd Qu.:0.08161   3rd Qu.:0.05863   3rd Qu.: NA  
##  Max.   :0.11484   Max.   :0.08593   Max.   :0.05863   Max.   : NA  
##  NA's   :37        NA's   :38        NA's   :39        NA's   :40

If you want you can save the fst values as csv.

# Convert to data frame
LD2_df <- data.frame(LD2_3)
# Save it
write.csv(LD2_df, file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/LD2_df.csv"))

Check the Fst values

head(LD2_df)
##             SOC        SEV        GES        KER        KRA TIK RAR TRE ALU STS
## SOC          NA         NA         NA         NA         NA  NA  NA  NA  NA  NA
## SEV 0.045552200         NA         NA         NA         NA  NA  NA  NA  NA  NA
## GES 0.011806713 0.05534614         NA         NA         NA  NA  NA  NA  NA  NA
## KER 0.045624567 0.06766476 0.05379764         NA         NA  NA  NA  NA  NA  NA
## KRA 0.007012644 0.03129251 0.01664437 0.03044281         NA  NA  NA  NA  NA  NA
## TIK 0.057181751 0.07990133 0.05211205 0.07167703 0.03563554  NA  NA  NA  NA  NA
##     SIC BRE DES CES TIR IMP ROM GRC BAR BUL CRO GRA ITB MAL SPM TUA TUH ALD FRS
## SOC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## SEV  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GES  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KER  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KRA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TIK  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
##     ITP POP ROS SER SLO SPC SPB SPS ARM ALV ITR
## SOC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## SEV  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GES  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KER  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KRA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TIK  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

Convert the data into a matrix.

aa <- as.matrix(LD2_df)
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##             SOC        SEV        GES        KER         KRA        TIK
## SOC          NA 0.04555220 0.01180671 0.04562457 0.007012644 0.05718175
## SEV 0.045552200         NA 0.05534614 0.06766476 0.031292513 0.07990133
## GES 0.011806713 0.05534614         NA 0.05379764 0.016644374 0.05211205
## KER 0.045624567 0.06766476 0.05379764         NA 0.030442814 0.07167703
## KRA 0.007012644 0.03129251 0.01664437 0.03044281          NA 0.03563554
## TIK 0.057181751 0.07990133 0.05211205 0.07167703 0.035635540         NA
##            RAR        TRE        ALU        STS        SIC       BRE       DES
## SOC 0.02146036 0.09313894 0.06849806 0.10571972 0.11554876 0.1739014 0.1604400
## SEV 0.05349749 0.11338435 0.09532910 0.12661620 0.13880220 0.1942978 0.1809121
## GES 0.02883972 0.09789464 0.08423622 0.10896011 0.12121523 0.1777287 0.1653767
## KER 0.05216969 0.07079005 0.06547083 0.08191323 0.09665681 0.1507775 0.1370573
## KRA 0.01117963 0.06857930 0.05392869 0.08213222 0.09068031 0.1479356 0.1356648
## TIK 0.05273252 0.11033213 0.08909356 0.12253683 0.13577321 0.1902386 0.1766860
##           CES        TIR        IMP        ROM        GRC       BAR       BUL
## SOC 0.2035848 0.12595232 0.12040973 0.10473003 0.11390767 0.1342407 0.1259630
## SEV 0.2237784 0.15228764 0.14817276 0.13069532 0.13271938 0.1566361 0.1478450
## GES 0.2077032 0.13293266 0.12751597 0.10999453 0.11954040 0.1399889 0.1311498
## KER 0.1781829 0.10983965 0.09348153 0.08718079 0.09922120 0.1219008 0.1092024
## KRA 0.1783668 0.09963636 0.09342591 0.07912006 0.08852899 0.1098279 0.1018435
## TIK 0.2206370 0.15084214 0.14563127 0.12846942 0.13162156 0.1512489 0.1460110
##            CRO        GRA       ITB        MAL        SPM        TUA        TUH
## SOC 0.11285209 0.10389647 0.1321288 0.09253993 0.11126944 0.11525321 0.10366569
## SEV 0.13446449 0.12468818 0.1565682 0.11566729 0.13877342 0.13632772 0.12331465
## GES 0.11881055 0.10942797 0.1379516 0.09895866 0.11908360 0.12052592 0.10931793
## KER 0.09867196 0.09173843 0.1091228 0.07477024 0.08913167 0.10089836 0.08500275
## KRA 0.09054381 0.08042505 0.1055944 0.07104441 0.08321229 0.08881825 0.07810616
## TIK 0.13095324 0.12225653 0.1554350 0.11274247 0.13430953 0.13345426 0.12082138
##           ALD        FRS        ITP        POP       ROS       SER        SLO
## SOC 0.1380274 0.10079113 0.09753798 0.09704470 0.1306794 0.2249183 0.08859806
## SEV 0.1589880 0.12036408 0.12098394 0.11863814 0.1517054 0.2495944 0.10848958
## GES 0.1437654 0.10515731 0.10354928 0.10227705 0.1357736 0.2313209 0.09341733
## KER 0.1242578 0.07724840 0.07881705 0.07343890 0.1090094 0.2046692 0.07183871
## KRA 0.1134517 0.07524388 0.07279459 0.07265533 0.1045599 0.1954888 0.06240033
## TIK 0.1572443 0.11835137 0.11890137 0.11648063 0.1472208 0.2471006 0.10528389
##            SPC        SPB       SPS        ARM        ALV        ITR
## SOC 0.12461422 0.11407133 0.1477816 0.04422243 0.11593610 0.09554091
## SEV 0.15272739 0.13981094 0.1702386 0.07107957 0.13792904 0.11634052
## GES 0.13137300 0.11978821 0.1545255 0.05484410 0.12252625 0.10032352
## KER 0.10189184 0.09283876 0.1304073 0.04004856 0.10166083 0.07891624
## KRA 0.09834158 0.08996602 0.1225046 0.01855860 0.09369755 0.07150888
## TIK 0.14579627 0.13577266 0.1681516 0.06652363 0.13457140 0.11344720

Import sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))


# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    order
  )

# Check it
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##   [1] "BER" "COL" "PAL" "HOU" "LOS" "MAU" "REC" "GRV" "FRS" "STS" "POP" "POL"
##  [13] "SPB" "SPS" "SPC" "BAR" "SPM" "IMP" "ITG" "BRE" "DES" "TRE" "ITB" "CES"
##  [25] "ROM" "ITR" "SIC" "ITP" "MAL" "SLO" "CRO" "ALV" "ALD" "TIR" "SER" "GRA"
##  [37] "GRC" "ROS" "BUL" "TUA" "TUH" "SEV" "ALU" "KER" "KRA" "SOC" "TIK" "RAR"
##  [49] "GES" "ARM" "KAN" "UTS" "KAG" "OKI" "HAI" "YUN" "HUN" "TAI" "GEL" "BEN"
##  [61] "KUN" "KAT" "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC"
##  [73] "SON" "CHA" "LAM" "HAN" "HOC" "QNC" "LIB" "MAD" "TRO" "DAU" "JAT" "YAT"
##  [85] "GAB" "ANT" "DGV" "VOH" "RAB" "YAO" "AWK" "BRM" "JAM" "SAI" "BEA" "CHI"
##  [97] "DAL" "FAY" "MAC" "MAN" "NEO" "NEW" "NUE" "PEO" "RUS" "SPR" "POR" "NOV"
## [109] "TUC" "MED" "AIZ" "HIR" "KHO" "KYO" "NAG" "NIG" "SAG" "SAK" "SEN" "TAN"
LD2_df_ordered <- read_csv(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/LD2_df_odered.csv"
)
## New names:
## Rows: 40 Columns: 41
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (40): FRS, STS, POP, SPB, SPS, SPC, BAR, SPM, IMP, BRE, DES, TRE,
## ITB, C...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Below does not change order Create vector with order of populations

# Extract the populations that appear in LD2_df
populations_in_LD2 <- colnames(LD2_df_ordered)

# Reorder the populations based on order_pops
poporder <- populations_in_LD2[populations_in_LD2 %in% order_pops]

#LD2_df[match(poporder, LD2_df$Abbreviation),] #this also doesn't reorder it

# Print the reordered populations
print(poporder)
##  [1] "FRS" "STS" "POP" "SPB" "SPS" "SPC" "BAR" "SPM" "IMP" "BRE" "DES" "TRE"
## [13] "ITB" "CES" "ROM" "ITR" "SIC" "ITP" "MAL" "SLO" "CRO" "ALV" "ALD" "TIR"
## [25] "SER" "GRA" "GRC" "ROS" "BUL" "TUA" "TUH" "SEV" "ALU" "KER" "KRA" "SOC"
## [37] "TIK" "RAR" "GES" "ARM"

Lets check if the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Order the matrix using poporder. We will also add NA on the upper left side of the matrix.

aa <- aa[poporder, poporder]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1           Var2          value        
##  FRS    :  40   FRS    :  40   Min.   :-0.0010  
##  STS    :  40   STS    :  40   1st Qu.: 0.0665  
##  POP    :  40   POP    :  40   Median : 0.0931  
##  SPB    :  40   SPB    :  40   Mean   : 0.0991  
##  SPS    :  40   SPS    :  40   3rd Qu.: 0.1243  
##  SPC    :  40   SPC    :  40   Max.   : 0.2813  
##  (Other):1360   (Other):1360   NA's   :820

Now lets plot the data with ggplot. You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 2) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

Save it

ggsave(
  filename = here("output", "europe", "fst", "fst_matrix_europe_LD2_ordered.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)

1.3 Fst by country for Europe SNP Set 2

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(LD2_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

#save the fst values as csv
write.csv(fst_summary, file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/fst_summary1.csv"))

fst_summary_ordered <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/fst_summary1_ordered.txt")
#fst_summary_ordered <- data.frame(fst_summary_ordered)
fst_summary_ordered <- read.table(fst_summary_ordered,
                                  header=TRUE)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary_ordered, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}

# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))

# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##             Albania    Armenia   Bulgaria    Croatia     France    Georgia
## Albania          NA 0.09231399 0.10160985 0.02884265 0.08188077 0.13307477
## Armenia  0.09231399         NA 0.09402725 0.08299684 0.06455227 0.05484410
## Bulgaria 0.10160985 0.09402725         NA 0.08739974 0.07533960 0.13114976
## Croatia  0.02884265 0.08299684 0.08739974         NA 0.06454581 0.11881055
## France   0.08188077 0.06455227 0.07533960 0.06454581         NA 0.10705871
## Georgia  0.13307477 0.05484410 0.13114976 0.11881055 0.10705871         NA
## Greece   0.06106728 0.07997747 0.09085677 0.05464731 0.07011300 0.11448419
## Italy    0.09996591 0.08046541 0.09077892 0.08415611 0.07080554 0.13492534
## Malta    0.07007296 0.06246256 0.05837692 0.05617341 0.03917767 0.09895866
## Portugal 0.07553921 0.05447484 0.06655260 0.06430580 0.04306543 0.10227705
## Romania  0.10978487 0.09224141 0.10230543 0.09812481 0.07432071 0.13577364
## Russia   0.12792279 0.04538750 0.12692205 0.11359257 0.10317288 0.02216938
## Serbia   0.20158615 0.18849691 0.19946824 0.17399103 0.17021355 0.23132092
## Slovenia 0.06627041 0.05313194 0.06563008 0.05813764 0.04427634 0.09341733
## Spain    0.10947480 0.08133491 0.10628290 0.09472900 0.07205435 0.13295185
## Turkey   0.08085839 0.07780083 0.07721910 0.07165525 0.06093551 0.11492193
## Ukraine  0.12214703 0.05856725 0.12052216 0.10875348 0.09514142 0.06218153
##              Greece      Italy      Malta   Portugal    Romania     Russia
## Albania  0.06106728 0.09996591 0.07007296 0.07553921 0.10978487 0.12792279
## Armenia  0.07997747 0.08046541 0.06246256 0.05447484 0.09224141 0.04538750
## Bulgaria 0.09085677 0.09077892 0.05837692 0.06655260 0.10230543 0.12692205
## Croatia  0.05464731 0.08415611 0.05617341 0.06430580 0.09812481 0.11359257
## France   0.07011300 0.07080554 0.03917767 0.04306543 0.07432071 0.10317288
## Georgia  0.11448419 0.13492534 0.09895866 0.10227705 0.13577364 0.02216938
## Greece           NA 0.08772003 0.05425446 0.06662748 0.09981326 0.10892420
## Italy    0.08772003         NA 0.04668323 0.04895747 0.09078298 0.13062252
## Malta    0.05425446 0.04668323         NA 0.03008816 0.07279516 0.09457124
## Portugal 0.06662748 0.04895747 0.03008816         NA 0.07206916 0.09820792
## Romania  0.09981326 0.09078298 0.07279516 0.07206916         NA 0.13035972
## Russia   0.10892420 0.13062252 0.09457124 0.09820792 0.13035972         NA
## Serbia   0.18695824 0.19097858 0.16306302 0.16561275 0.20116330 0.22546408
## Slovenia 0.05637978 0.05224372 0.03900747 0.03651465 0.05726610 0.08821314
## Spain    0.09375979 0.08986473 0.07238616 0.06471743 0.10652495 0.12772002
## Turkey   0.07160055 0.07018211 0.04627632 0.05506548 0.09143886 0.10920612
## Ukraine  0.10325455 0.09842950 0.08717645 0.09011392 0.12408803 0.05788560
##             Serbia   Slovenia      Spain     Turkey    Ukraine
## Albania  0.2015861 0.06627041 0.10947480 0.08085839 0.12214703
## Armenia  0.1884969 0.05313194 0.08133491 0.07780083 0.05856725
## Bulgaria 0.1994682 0.06563008 0.10628290 0.07721910 0.12052216
## Croatia  0.1739910 0.05813764 0.09472900 0.07165525 0.10875348
## France   0.1702136 0.04427634 0.07205435 0.06093551 0.09514142
## Georgia  0.2313209 0.09341733 0.13295185 0.11492193 0.06218153
## Greece   0.1869582 0.05637978 0.09375979 0.07160055 0.10325455
## Italy    0.1909786 0.05224372 0.08986473 0.07018211 0.09842950
## Malta    0.1630630 0.03900747 0.07238616 0.04627632 0.08717645
## Portugal 0.1656127 0.03651465 0.06471743 0.05506548 0.09011392
## Romania  0.2011633 0.05726610 0.10652495 0.09143886 0.12408803
## Russia   0.2254641 0.08821314 0.12772002 0.10920612 0.05788560
## Serbia          NA 0.15393428 0.21730466 0.18762268 0.21784855
## Slovenia 0.1539343         NA 0.06472219 0.05217992 0.08222846
## Spain    0.2173047 0.06472219         NA 0.07187648 0.12136587
## Turkey   0.1876227 0.05217992 0.07187648         NA 0.10307601
## Ukraine  0.2178485 0.08222846 0.12136587 0.10307601         NA

1.3.1 Reorder

# Read the file
country_order <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/country_order.txt")
country_order <- read.table(country_order, 
                       header = FALSE,
                       col.names = c("country"))

order_countries <- as.vector(country_order$country)
order_countries
##  [1] "France"   "Portugal" "Spain"    "Italy"    "Malta"    "Slovenia"
##  [7] "Croatia"  "Albania"  "Serbia"   "Greece"   "Romania"  "Bulgaria"
## [13] "Turkey"   "Ukraine"  "Russia"   "Georgia"  "Armenia"
write.csv(symmetric_fst_matrix, file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/symmetric_fst_matrix.csv"))

symmetric_fst_matrix_ordered <- read_csv(
 "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/symmetric_fst_matrix_ordered.csv"
)
## New names:
## Rows: 17 Columns: 18
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (17): France, Portugal, Spain, Italy, Malta, Slovenia, Croatia,
## Albania,...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
symmetric_fst_matrix_ordered<- as.data.frame(symmetric_fst_matrix_ordered) #convert to dataframe
rownames(symmetric_fst_matrix_ordered) <- symmetric_fst_matrix_ordered$...1 #use first column as rownames
symmetric_fst_matrix_ordered <- symmetric_fst_matrix_ordered[ -c(1) ] #remove 1st column
symmetric_fst_matrix_ordered <-as.matrix(symmetric_fst_matrix_ordered) #convert to matrix
symmetric_fst_matrix_ordered[lower.tri(symmetric_fst_matrix_ordered)] <- NA
print(symmetric_fst_matrix_ordered)
##          France   Portugal      Spain      Italy      Malta   Slovenia
## France       NA 0.04306543 0.07205435 0.07080554 0.03917767 0.04427634
## Portugal     NA         NA 0.06471743 0.04895747 0.03008816 0.03651465
## Spain        NA         NA         NA 0.08986473 0.07238616 0.06472219
## Italy        NA         NA         NA         NA 0.04668323 0.05224372
## Malta        NA         NA         NA         NA         NA 0.03900747
## Slovenia     NA         NA         NA         NA         NA         NA
## Croatia      NA         NA         NA         NA         NA         NA
## Albania      NA         NA         NA         NA         NA         NA
## Serbia       NA         NA         NA         NA         NA         NA
## Greece       NA         NA         NA         NA         NA         NA
## Romania      NA         NA         NA         NA         NA         NA
## Bulgaria     NA         NA         NA         NA         NA         NA
## Turkey       NA         NA         NA         NA         NA         NA
## Ukraine      NA         NA         NA         NA         NA         NA
## Russia       NA         NA         NA         NA         NA         NA
## Georgia      NA         NA         NA         NA         NA         NA
## Armenia      NA         NA         NA         NA         NA         NA
##             Croatia    Albania    Serbia     Greece    Romania   Bulgaria
## France   0.06454582 0.08188078 0.1702136 0.07011300 0.07432071 0.07533960
## Portugal 0.06430580 0.07553921 0.1656127 0.06662748 0.07206916 0.06655260
## Spain    0.09472900 0.10947480 0.2173047 0.09375979 0.10652494 0.10628290
## Italy    0.08415611 0.09996591 0.1909786 0.08772003 0.09078298 0.09077892
## Malta    0.05617341 0.07007296 0.1630630 0.05425446 0.07279516 0.05837692
## Slovenia 0.05813764 0.06627041 0.1539343 0.05637978 0.05726610 0.06563008
## Croatia          NA 0.02884265 0.1739910 0.05464731 0.09812481 0.08739974
## Albania          NA         NA 0.2015861 0.06106728 0.10978487 0.10160985
## Serbia           NA         NA        NA 0.18695824 0.20116330 0.19946824
## Greece           NA         NA        NA         NA 0.09981326 0.09085677
## Romania          NA         NA        NA         NA         NA 0.10230543
## Bulgaria         NA         NA        NA         NA         NA         NA
## Turkey           NA         NA        NA         NA         NA         NA
## Ukraine          NA         NA        NA         NA         NA         NA
## Russia           NA         NA        NA         NA         NA         NA
## Georgia          NA         NA        NA         NA         NA         NA
## Armenia          NA         NA        NA         NA         NA         NA
##              Turkey    Ukraine     Russia    Georgia    Armenia
## France   0.06093551 0.09514142 0.10317288 0.10705871 0.06455227
## Portugal 0.05506548 0.09011392 0.09820792 0.10227705 0.05447484
## Spain    0.07187648 0.12136587 0.12772002 0.13295185 0.08133491
## Italy    0.07018211 0.09842950 0.13062252 0.13492534 0.08046541
## Malta    0.04627632 0.08717645 0.09457124 0.09895866 0.06246256
## Slovenia 0.05217992 0.08222846 0.08821314 0.09341733 0.05313194
## Croatia  0.07165525 0.10875348 0.11359257 0.11881055 0.08299684
## Albania  0.08085839 0.12214703 0.12792279 0.13307477 0.09231399
## Serbia   0.18762268 0.21784855 0.22546408 0.23132092 0.18849691
## Greece   0.07160055 0.10325455 0.10892420 0.11448419 0.07997747
## Romania  0.09143886 0.12408803 0.13035972 0.13577364 0.09224141
## Bulgaria 0.07721910 0.12052216 0.12692205 0.13114976 0.09402725
## Turkey           NA 0.10307601 0.10920612 0.11492193 0.07780083
## Ukraine          NA         NA 0.05788560 0.06218153 0.05856725
## Russia           NA         NA         NA 0.02216938 0.04538750
## Georgia          NA         NA         NA         NA 0.05484410
## Armenia          NA         NA         NA         NA         NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix_ordered)
summary(pairfst.long2)
##        Var1           Var2         value        
##  France  : 17   France  : 17   Min.   :0.02217  
##  Portugal: 17   Portugal: 17   1st Qu.:0.06468  
##  Spain   : 17   Spain   : 17   Median :0.08904  
##  Italy   : 17   Italy   : 17   Mean   :0.09569  
##  Malta   : 17   Malta   : 17   3rd Qu.:0.11074  
##  Slovenia: 17   Slovenia: 17   Max.   :0.23132  
##  (Other) :187   (Other) :187   NA's   :153

You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("output", "europe", "fst", "fst_matrix_Europe_LD2_by_country_ordered.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 40 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 FRS     0.0746
##  2 STS     0.0789
##  3 POP     0.0704
##  4 SPB     0.0897
##  5 SPS     0.120 
##  6 SPC     0.100 
##  7 BAR     0.114 
##  8 SPM     0.0810
##  9 IMP     0.0934
## 10 BRE     0.144 
## # ℹ 30 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Region
  )

head(fst3)
##               Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1 Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2           Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3             Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4              Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5            San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6            Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
##     Subregion order order2 orderold   mean_fst
## 1 West Europe     9      1        1 0.07462900
## 2 West Europe    10      2        2 0.07891003
## 3 West Europe    11      3        3 0.07039614
## 4 West Europe    13      5        5 0.08966956
## 5 West Europe    14      6        6 0.11997670
## 6 West Europe    15      7        7 0.10001777

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Subregion) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Subregion")

# Print the modified fst3 tibble
print(fst3)
##                Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##      Subregion order order2 orderold   mean_fst mean_fst_by_region
## 1  West Europe     9      1        1 0.07462900                0.1
## 2  West Europe    10      2        2 0.07891003                0.1
## 3  West Europe    11      3        3 0.07039614                0.1
## 4  West Europe    13      5        5 0.08966956                0.1
## 5  West Europe    14      6        6 0.11997670                0.1
## 6  West Europe    15      7        7 0.10001777                0.1
## 7  West Europe    16      8        8 0.11361455                0.1
## 8  West Europe    17      9        9 0.08101992                0.1
## 9  West Europe    18     10       10 0.09338559                0.1
## 10 West Europe    20     12       12 0.14417917                0.1
## 11 West Europe    21     13       13 0.12864170                0.1
## 12 West Europe    22     14       14 0.06256140                0.1
## 13 West Europe    23     15       15 0.10181636                0.1
## 14 West Europe    24     16       16 0.17560017                0.1
## 15 West Europe    25     17       17 0.07923011                0.1
## 16 West Europe    26     18       18 0.07069860                0.1
## 17 West Europe    27     19       19 0.08872802                0.1
## 18 West Europe    28     20       20 0.07094679                0.1
## 19 West Europe    29     21       21 0.06875044                0.1
## 20 East Europe    30     22       22 0.06683635                0.1
## 21 East Europe    31     23       23 0.08693351                0.1
## 22 East Europe    32     24       24 0.09045198                0.1
## 23 East Europe    33     25       25 0.11233441                0.1
## 24 East Europe    34     26       26 0.09810811                0.1
## 25 East Europe    35     27       27 0.20341532                0.1
## 26 East Europe    36     28       28 0.08644638                0.1
## 27 East Europe    37     29       29 0.09177586                0.1
## 28 East Europe    38     30       30 0.10597770                0.1
## 29 East Europe    39     31       31 0.10187716                0.1
## 30 East Europe    40     32       32 0.09142387                0.1
## 31 East Europe    41     33       33 0.07873389                0.1
## 32 East Europe    42     34       34 0.12875447                0.1
## 33 East Europe    43     35       35 0.09618709                0.1
## 34 East Europe    44     36       36 0.09378027                0.1
## 35 East Europe    45     37       37 0.08335369                0.1
## 36 East Europe    46     38       38 0.10579596                0.1
## 37 East Europe    47     39       39 0.12649180                0.1
## 38 East Europe    48     40       40 0.11384850                0.1
## 39 East Europe    49     41       41 0.11167952                0.1
## 40 East Europe    50     42       42 0.07881133                0.1

Mean By country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
##                Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##      Subregion order order2 orderold   mean_fst mean_fst_by_region
## 1  West Europe     9      1        1 0.07462900                0.1
## 2  West Europe    10      2        2 0.07891003                0.1
## 3  West Europe    11      3        3 0.07039614                0.1
## 4  West Europe    13      5        5 0.08966956                0.1
## 5  West Europe    14      6        6 0.11997670                0.1
## 6  West Europe    15      7        7 0.10001777                0.1
## 7  West Europe    16      8        8 0.11361455                0.1
## 8  West Europe    17      9        9 0.08101992                0.1
## 9  West Europe    18     10       10 0.09338559                0.1
## 10 West Europe    20     12       12 0.14417917                0.1
## 11 West Europe    21     13       13 0.12864170                0.1
## 12 West Europe    22     14       14 0.06256140                0.1
## 13 West Europe    23     15       15 0.10181636                0.1
## 14 West Europe    24     16       16 0.17560017                0.1
## 15 West Europe    25     17       17 0.07923011                0.1
## 16 West Europe    26     18       18 0.07069860                0.1
## 17 West Europe    27     19       19 0.08872802                0.1
## 18 West Europe    28     20       20 0.07094679                0.1
## 19 West Europe    29     21       21 0.06875044                0.1
## 20 East Europe    30     22       22 0.06683635                0.1
## 21 East Europe    31     23       23 0.08693351                0.1
## 22 East Europe    32     24       24 0.09045198                0.1
## 23 East Europe    33     25       25 0.11233441                0.1
## 24 East Europe    34     26       26 0.09810811                0.1
## 25 East Europe    35     27       27 0.20341532                0.1
## 26 East Europe    36     28       28 0.08644638                0.1
## 27 East Europe    37     29       29 0.09177586                0.1
## 28 East Europe    38     30       30 0.10597770                0.1
## 29 East Europe    39     31       31 0.10187716                0.1
## 30 East Europe    40     32       32 0.09142387                0.1
## 31 East Europe    41     33       33 0.07873389                0.1
## 32 East Europe    42     34       34 0.12875447                0.1
## 33 East Europe    43     35       35 0.09618709                0.1
## 34 East Europe    44     36       36 0.09378027                0.1
## 35 East Europe    45     37       37 0.08335369                0.1
## 36 East Europe    46     38       38 0.10579596                0.1
## 37 East Europe    47     39       39 0.12649180                0.1
## 38 East Europe    48     40       40 0.11384850                0.1
## 39 East Europe    49     41       41 0.11167952                0.1
## 40 East Europe    50     42       42 0.07881133                0.1
##    mean_fst_by_country
## 1                 0.08
## 2                 0.08
## 3                 0.07
## 4                 0.10
## 5                 0.10
## 6                 0.10
## 7                 0.10
## 8                 0.10
## 9                 0.10
## 10                0.10
## 11                0.10
## 12                0.10
## 13                0.10
## 14                0.10
## 15                0.10
## 16                0.10
## 17                0.10
## 18                0.10
## 19                0.07
## 20                0.07
## 21                0.09
## 22                0.10
## 23                0.10
## 24                0.10
## 25                0.20
## 26                0.09
## 27                0.09
## 28                0.11
## 29                0.10
## 30                0.09
## 31                0.09
## 32                0.11
## 33                0.11
## 34                0.11
## 35                0.11
## 36                0.11
## 37                0.11
## 38                0.11
## 39                0.11
## 40                0.08

Mean by latitude

# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
  mutate(Latitude_group = ifelse(Latitude >= 40, "Above 40N", "Below 40N"))

# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
  group_by(Latitude_group) |>
  summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
  left_join(summary_by_latitude, by = "Latitude_group")


# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City)

# Print the modified fst3 tibble
print(fst3)
##                    City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##      Subregion order order2 orderold   mean_fst mean_fst_by_region
## 1  West Europe     9      1        1 0.07462900                0.1
## 2  West Europe    10      2        2 0.07891003                0.1
## 3  West Europe    11      3        3 0.07039614                0.1
## 4  West Europe    13      5        5 0.08966956                0.1
## 5  West Europe    14      6        6 0.11997670                0.1
## 6  West Europe    15      7        7 0.10001777                0.1
## 7  West Europe    16      8        8 0.11361455                0.1
## 8  West Europe    17      9        9 0.08101992                0.1
## 9  West Europe    18     10       10 0.09338559                0.1
## 10 West Europe    20     12       12 0.14417917                0.1
## 11 West Europe    21     13       13 0.12864170                0.1
## 12 West Europe    22     14       14 0.06256140                0.1
## 13 West Europe    23     15       15 0.10181636                0.1
## 14 West Europe    24     16       16 0.17560017                0.1
## 15 West Europe    25     17       17 0.07923011                0.1
## 16 West Europe    26     18       18 0.07069860                0.1
## 17 West Europe    27     19       19 0.08872802                0.1
## 18 West Europe    28     20       20 0.07094679                0.1
## 19 West Europe    29     21       21 0.06875044                0.1
## 20 East Europe    30     22       22 0.06683635                0.1
## 21 East Europe    31     23       23 0.08693351                0.1
## 22 East Europe    32     24       24 0.09045198                0.1
## 23 East Europe    33     25       25 0.11233441                0.1
## 24 East Europe    34     26       26 0.09810811                0.1
## 25 East Europe    35     27       27 0.20341532                0.1
## 26 East Europe    36     28       28 0.08644638                0.1
## 27 East Europe    37     29       29 0.09177586                0.1
## 28 East Europe    38     30       30 0.10597770                0.1
## 29 East Europe    39     31       31 0.10187716                0.1
## 30 East Europe    40     32       32 0.09142387                0.1
## 31 East Europe    41     33       33 0.07873389                0.1
## 32 East Europe    42     34       34 0.12875447                0.1
## 33 East Europe    43     35       35 0.09618709                0.1
## 34 East Europe    44     36       36 0.09378027                0.1
## 35 East Europe    45     37       37 0.08335369                0.1
## 36 East Europe    46     38       38 0.10579596                0.1
## 37 East Europe    47     39       39 0.12649180                0.1
## 38 East Europe    48     40       40 0.11384850                0.1
## 39 East Europe    49     41       41 0.11167952                0.1
## 40 East Europe    50     42       42 0.07881133                0.1
##    mean_fst_by_country Latitude_group mean_fst_by_latitude
## 1                 0.08      Above 40N           0.10154776
## 2                 0.08      Above 40N           0.10154776
## 3                 0.07      Above 40N           0.10154776
## 4                 0.10      Below 40N           0.09086761
## 5                 0.10      Below 40N           0.09086761
## 6                 0.10      Below 40N           0.09086761
## 7                 0.10      Above 40N           0.10154776
## 8                 0.10      Below 40N           0.09086761
## 9                 0.10      Above 40N           0.10154776
## 10                0.10      Above 40N           0.10154776
## 11                0.10      Above 40N           0.10154776
## 12                0.10      Above 40N           0.10154776
## 13                0.10      Above 40N           0.10154776
## 14                0.10      Above 40N           0.10154776
## 15                0.10      Above 40N           0.10154776
## 16                0.10      Above 40N           0.10154776
## 17                0.10      Below 40N           0.09086761
## 18                0.10      Above 40N           0.10154776
## 19                0.07      Below 40N           0.09086761
## 20                0.07      Above 40N           0.10154776
## 21                0.09      Above 40N           0.10154776
## 22                0.10      Above 40N           0.10154776
## 23                0.10      Above 40N           0.10154776
## 24                0.10      Above 40N           0.10154776
## 25                0.20      Above 40N           0.10154776
## 26                0.09      Below 40N           0.09086761
## 27                0.09      Below 40N           0.09086761
## 28                0.11      Above 40N           0.10154776
## 29                0.10      Above 40N           0.10154776
## 30                0.09      Below 40N           0.09086761
## 31                0.09      Above 40N           0.10154776
## 32                0.11      Above 40N           0.10154776
## 33                0.11      Above 40N           0.10154776
## 34                0.11      Above 40N           0.10154776
## 35                0.11      Above 40N           0.10154776
## 36                0.11      Above 40N           0.10154776
## 37                0.11      Above 40N           0.10154776
## 38                0.11      Above 40N           0.10154776
## 39                0.11      Above 40N           0.10154776
## 40                0.08      Above 40N           0.10154776
fst4 <- fst3 |>
  dplyr::select(
    Latitude_group, mean_fst_by_latitude, Subregion, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Latitude_group, Subregion, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
##   Latitude_group mean_fst_by_latitude   Subregion mean_fst_by_region  Country
## 1      Above 40N                  0.1 East Europe                0.1  Albania
## 2      Above 40N                  0.1 East Europe                0.1  Albania
## 3      Above 40N                  0.1 East Europe                0.1  Albania
## 4      Above 40N                  0.1 East Europe                0.1  Armenia
## 5      Above 40N                  0.1 East Europe                0.1 Bulgaria
## 6      Above 40N                  0.1 East Europe                0.1  Croatia
##   mean_fst_by_country      City Abbreviation mean_fst
## 1                0.10    Durres          ALD     0.11
## 2                0.10    Tirana          TIR     0.10
## 3                0.10     Vlore          ALV     0.09
## 4                0.08    Ijevan          ARM     0.08
## 5                0.10       Lom          BUL     0.10
## 6                0.09 Dubrovnik          CRO     0.09
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values for Europe using LD2 SNPs.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values for Europe using LD2 SNPs.

Latitude_group

mean_fst_by_latitude

Subregion

mean_fst_by_region

Country

mean_fst_by_country

City

Abbreviation

mean_fst

Above 40N

0.10

East Europe

0.1

Albania

0.10

Durres

ALD

0.11

Above 40N

0.10

East Europe

0.1

Albania

0.10

Tirana

TIR

0.10

Above 40N

0.10

East Europe

0.1

Albania

0.10

Vlore

ALV

0.09

Above 40N

0.10

East Europe

0.1

Armenia

0.08

Ijevan

ARM

0.08

Above 40N

0.10

East Europe

0.1

Bulgaria

0.10

Lom

BUL

0.10

Above 40N

0.10

East Europe

0.1

Croatia

0.09

Dubrovnik

CRO

0.09

Above 40N

0.10

East Europe

0.1

Georgia

0.11

Sakhumi, Abkhazia

GES

0.11

Above 40N

0.10

East Europe

0.1

Romania

0.11

Satu Mare

ROS

0.11

Above 40N

0.10

East Europe

0.1

Russia

0.11

Armavir

RAR

0.11

Above 40N

0.10

East Europe

0.1

Russia

0.11

Krasnodar

KRA

0.08

Above 40N

0.10

East Europe

0.1

Russia

0.11

Sochi

SOC

0.11

Above 40N

0.10

East Europe

0.1

Russia

0.11

Tikhoretsk

TIK

0.13

Above 40N

0.10

East Europe

0.1

Serbia

0.20

Novi Sad

SER

0.20

Above 40N

0.10

East Europe

0.1

Slovenia

0.07

Ajdovscina

SLO

0.07

Above 40N

0.10

East Europe

0.1

Turkey

0.09

Hopa

TUH

0.08

Above 40N

0.10

East Europe

0.1

Ukraine

0.11

Alushta

ALU

0.10

Above 40N

0.10

East Europe

0.1

Ukraine

0.11

Kerch, Crimea

KER

0.09

Above 40N

0.10

East Europe

0.1

Ukraine

0.11

Sevastopol, Crimea

SEV

0.13

Above 40N

0.10

West Europe

0.1

France

0.08

Saint-Martin-d'Heres

FRS

0.07

Above 40N

0.10

West Europe

0.1

France

0.08

Strasbourg

STS

0.08

Above 40N

0.10

West Europe

0.1

Italy

0.10

Bologna

ITB

0.10

Above 40N

0.10

West Europe

0.1

Italy

0.10

Brescia

BRE

0.14

Above 40N

0.10

West Europe

0.1

Italy

0.10

Cesena

CES

0.18

Above 40N

0.10

West Europe

0.1

Italy

0.10

Desenzano

DES

0.13

Above 40N

0.10

West Europe

0.1

Italy

0.10

Imperia

IMP

0.09

Above 40N

0.10

West Europe

0.1

Italy

0.10

Puglia

ITP

0.07

Above 40N

0.10

West Europe

0.1

Italy

0.10

Rome (Sapienza)

ROM

0.08

Above 40N

0.10

West Europe

0.1

Italy

0.10

Rome (Trappola)

ITR

0.07

Above 40N

0.10

West Europe

0.1

Italy

0.10

Trentino

TRE

0.06

Above 40N

0.10

West Europe

0.1

Portugal

0.07

Penafiel

POP

0.07

Above 40N

0.10

West Europe

0.1

Spain

0.10

Barcelona

BAR

0.11

Below 40N

0.09

East Europe

0.1

Greece

0.09

Athens

GRA

0.09

Below 40N

0.09

East Europe

0.1

Greece

0.09

Chania

GRC

0.09

Below 40N

0.09

East Europe

0.1

Turkey

0.09

Aliaga

TUA

0.09

Below 40N

0.09

West Europe

0.1

Italy

0.10

Sicilia

SIC

0.09

Below 40N

0.09

West Europe

0.1

Malta

0.07

Luqa

MAL

0.07

Below 40N

0.09

West Europe

0.1

Spain

0.10

Badajoz

SPB

0.09

Below 40N

0.09

West Europe

0.1

Spain

0.10

Catarroja

SPC

0.10

Below 40N

0.09

West Europe

0.1

Spain

0.10

Magaluf

SPM

0.08

Below 40N

0.09

West Europe

0.1

Spain

0.10

San Roque

SPS

0.12

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "europe",
  "fst", 
  "fst_Europe_LD2_SNPS.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "output", "europe", "fst", "LD2_country.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           FRS       STS       POP       SPB       SPS       SPC      BAR
## FRS    0.0000  412.1522 1225.4383 1263.7518 1371.4448  817.6702 511.8716
## STS  412.1522    0.0000 1509.1745 1601.1573 1750.4874 1213.5639 915.9614
## POP 1225.4383 1509.1745    0.0000  282.8409  614.5101  701.9619 878.2910
## SPB 1263.7518 1601.1573  282.8409    0.0000  331.7685  571.0509 827.0716
## SPS 1371.4448 1750.4874  614.5101  331.7685    0.0000  566.5153 874.4050
## SPC  817.6702 1213.5639  701.9619  571.0509  566.5153    0.0000 310.0074
##           SPM       IMP       BRE       DES       TRE       ITB       CES
## FRS  683.8845  228.2822  349.1505  375.3876  427.8791  448.0871  524.5893
## STS 1095.6072  527.9620  389.6504  409.5459  380.8019  536.0256  604.0707
## POP  939.4811 1370.3898 1572.0008 1596.9006 1653.2807 1644.8695 1711.3013
## SPB  822.8201 1367.0861 1594.7230 1616.9360 1682.2286 1643.6694 1701.8380
## SPS  787.3372 1423.5497 1670.1969 1688.5822 1761.4648 1690.3169 1736.8728
## SPC  251.7726  857.0395 1104.4760 1122.4598 1195.9602 1124.6677 1173.2270
##           ROM       ITR      SIC      ITP      MAL       SLO      CRO      ALV
## FRS  654.1062  654.1062 1119.260 1004.474 1269.904  628.7258 1037.845 1234.162
## STS  834.5057  834.5057 1314.450 1098.126 1522.503  546.2343 1052.803 1296.908
## POP 1734.4434 1734.4434 2064.584 2102.566 2066.730 1851.8481 2197.100 2334.720
## SPB 1683.0686 1683.0686 1957.316 2040.301 1923.429 1869.2123 2157.626 2266.196
## SPS 1669.1413 1669.1413 1865.247 2002.327 1785.308 1929.6259 2144.409 2215.487
## SPC 1124.2692 1124.2692 1387.383 1474.434 1368.139 1363.2153 1600.981 1697.872
##          ALD      TIR       SER      GRA      GRC      ROS      BUL      TUA
## FRS 1192.210 1216.107 1100.2591 1709.174 1877.379 1341.427 1392.500 1884.818
## STS 1230.239 1248.235  988.6774 1763.684 1974.712 1124.852 1304.105 1885.544
## POP 2319.742 2346.521 2315.5228 2777.366 2876.039 2566.256 2591.385 3000.557
## SPB 2262.516 2289.859 2313.3538 2686.566 2760.206 2594.045 2569.099 2924.845
## SPS 2225.649 2253.121 2340.8167 2601.355 2642.916 2651.131 2568.416 2854.982
## SPC 1697.515 1724.957 1780.6744 2115.539 2193.176 2085.530 2019.617 2354.477
##          TUH      SEV      ALU      KER      KRA      SOC      TIK      RAR
## FRS 2897.030 2179.621 2245.512 2390.859 2596.424 2688.509 2660.271 2758.005
## STS 2745.357 2012.146 2071.666 2197.341 2400.939 2511.584 2447.357 2555.484
## POP 4103.736 3397.015 3464.308 3613.507 3818.891 3906.072 3885.080 3981.365
## SPB 4076.402 3387.944 3457.259 3614.231 3818.203 3894.554 3893.085 3982.756
## SPS 4053.790 3392.537 3463.561 3628.220 3828.858 3891.912 3913.323 3994.595
## SPC 3518.971 2842.701 2913.061 3074.730 3276.925 3346.038 3357.781 3442.342
##          GES      ARM
## FRS 2793.592 3207.499
## STS 2619.862 3045.177
## POP 4009.674 4417.211
## SPB 3995.220 4391.989
## SPS 3988.376 4369.793
## SPC 3444.704 3834.995

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0


# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))

data <- data |>
  dplyr::arrange(
    Distance
  )
head(data)
##   Distance FST row_index col_index
## 1        0   0       FRS       FRS
## 2        0   0       STS       STS
## 3        0   0       POP       POP
## 4        0   0       SPB       SPB
## 5        0   0       SPS       SPS
## 6        0   0       SPC       SPC

Fit linear regression

data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("analyses", "my_theme2.R"))


# Plot
ggplot(data, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("text", x = max(log((data$Distance))) * 0.85, y = max(data$FST/(1-data$FST)) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),  # Increase font size for x-axis
        axis.text.y = element_text(size = 14
                                   ))  # Increase font size for y-axis
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "europe", "fst", "fst_by_distance_europe_LD2.pdf"),
  width = 6,
  height = 4,
  units = "in"
)

1.4 Subset by country

Select countries with at least 3 sampling localities in Europe

countries_with_3_pops <- filtered_sampling_loc %>%
  group_by(Country) %>%
  filter(n() >= 3) %>%
  pull(Country) %>%
  unique()
countries_with_3_pops
## [1] "Spain"   "Italy"   "Albania" "Ukraine" "Russia"

Do test for each country

results <- list()

for (country in countries_with_3_pops) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc %>%
    filter(Country == country) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  subset_data <- subset_data[subset_data$Distance > 0, ]
  # Perform linear regression
  lm_model <- lm(FST/(1-FST) ~ log(Distance), data = subset_data)
  results[[country]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $Spain
## $Spain$equation
## [1] "y = -0.01214x + 0.167"
## 
## $Spain$r2
## [1] "R^2 = 0.01"
## 
## 
## $Italy
## $Italy$equation
## [1] "y = -0.01374x + 0.188"
## 
## $Italy$r2
## [1] "R^2 = 0.03"
## 
## 
## $Albania
## $Albania$equation
## [1] "y = -0.01588x + 0.083"
## 
## $Albania$r2
## [1] "R^2 = 0.50"
## 
## 
## $Ukraine
## $Ukraine$equation
## [1] "y = -0.02907x + 0.228"
## 
## $Ukraine$r2
## [1] "R^2 = 0.90"
## 
## 
## $Russia
## $Russia$equation
## [1] "y = 0.01280x + -0.033"
## 
## $Russia$r2
## [1] "R^2 = 0.02"

Merge the data

data_merged <- data %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
  rename(Country1 = Country) %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
  dplyr::select(-Pop_City.x, -Pop_City.y) %>%
  filter(Country1 == Country)  # Ensures the data is within the same country


# Filter to get the coutries with 3 or more sampling localities
countries_to_include <- c("Spain", "Italy", "Albania", "Ukraine", "Russia")

# Filter
data_filtered <- data_merged %>%
  group_by(Country1) %>%
  filter(n() >= 3 & Country1 %in% countries_to_include) %>%
  ungroup()

Calculate linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST/(1-FST) ~ log(Distance), data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("italic(y) == %.3f * italic(x) + %.3f", coef(model)[2], coef(model)[1]),
         r2 = sprintf("italic(R)^2 == %.2f", summary(model)$r.squared))

Plot it

ggplot(data_filtered, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(
    data = regression_results, 
    aes(label = paste(equation, r2, sep = "~~")), 
    x = Inf, y = Inf, 
    vjust = 2, hjust = 1.2, 
    size = 3.5, 
    parse = TRUE, 
    inherit.aes = FALSE
  ) +
  labs(title = "Fst vs Distance by Country",
       x = "Log(Distance)",
       y = "Fst(1-Fst)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "europe", "fst", "fst_by_distance_countries_Europe_LD2.pdf"),
  width = 6,
  height = 8,
  units = "in"
)

We can merge the FST and distance matrices

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the matrix (Fst two decimals and distance in Km with zero decimals)
# Format the elements based on their position in the matrix
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Now the merged_matrix should be formatted as you need
print(merged_matrix)
##     ALD    ALU    ALV    ARM    BAR    BRE    BUL    CES    CRO    DES   
## ALD NA     "0.12" "0.02" "0.11" "0.12" "0.17" "0.11" "0.20" "0.05" "0.16"
## ALU "1268" NA     "0.10" "0.06" "0.11" "0.16" "0.10" "0.19" "0.09" "0.14"
## ALV "93"   "1307" NA     "0.09" "0.10" "0.15" "0.09" "0.18" "0.03" "0.14"
## ARM "2144" "974"  "2159" NA     "0.11" "0.13" "0.09" "0.16" "0.08" "0.11"
## BAR "1446" "2631" "1458" "3566" NA     "0.17" "0.11" "0.20" "0.10" "0.16"
## BRE "887"  "1896" "942"  "2859" "796"  NA     "0.15" "0.16" "0.15" "0.09"
## BUL "414"  "895"  "483"  "1826" "1742" "1048" NA     "0.19" "0.09" "0.14"
## CES "673"  "1757" "724"  "2706" "878"  "223"  "881"  NA     "0.18" "0.14"
## CRO "180"  "1321" "260"  "2235" "1333" "719"  "428"  "514"  NA     "0.13"
## DES "859"  "1870" "915"  "2832" "815"  "28"   "1021" "198"  "691"  NA    
## FRS "1192" "2246" "1234" "3207" "512"  "349"  "1393" "525"  "1038" "375" 
## GES "1770" "550"  "1795" "429"  "3168" "2444" "1426" "2300" "1845" "2418"
## GRA "534"  "1151" "476"  "1848" "1900" "1417" "656"  "1200" "711"  "1390"
## GRC "754"  "1348" "679"  "1937" "2007" "1611" "925"  "1389" "934"  "1585"
## IMP "984"  "2097" "1020" "3047" "552"  "254"  "1221" "341"  "840"  "268" 
## ITB "752"  "1821" "802"  "2773" "824"  "148"  "950"  "79"   "591"  "126" 
## ITP "224"  "1482" "235"  "2367" "1227" "728"  "604"  "506"  "201"  "702" 
## ITR "585"  "1794" "605"  "2711" "862"  "445"  "899"  "252"  "477"  "427" 
## KER "1443" "179"  "1484" "862"  "2789" "2042" "1062" "1913" "1490" "2017"
## KRA "1633" "366"  "1670" "674"  "2992" "2248" "1261" "2117" "1687" "2222"
## MAL "746"  "1948" "674"  "2719" "1233" "1135" "1157" "943"  "817"  "1119"
## POP "2320" "3464" "2335" "4417" "878"  "1572" "2591" "1711" "2197" "1597"
## RAR "1798" "532"  "1833" "560"  "3157" "2410" "1427" "2281" "1853" "2385"
## ROM "585"  "1794" "605"  "2711" "862"  "445"  "899"  "252"  "477"  "427" 
## ROS "771"  "950"  "859"  "1923" "1783" "1000" "445"  "917"  "683"  "977" 
## SER "442"  "1149" "534"  "2107" "1489" "752"  "316"  "612"  "322"  "725" 
## SEV "1196" "72"   "1235" "1034" "2561" "1830" "824"  "1689" "1250" "1804"
## SIC "480"  "1724" "420"  "2545" "1195" "925"  "894"  "715"  "537"  "905" 
## SLO "689"  "1617" "761"  "2581" "1059" "280"  "782"  "227"  "509"  "255" 
## SOC "1678" "443"  "1706" "538"  "3068" "2339" "1327" "2197" "1748" "2313"
## SPB "2263" "3457" "2266" "4392" "827"  "1595" "2569" "1702" "2158" "1617"
## SPC "1698" "2913" "1698" "3835" "310"  "1104" "2020" "1173" "1601" "1122"
## SPM "1450" "2677" "1448" "3591" "211"  "919"  "1782" "956"  "1360" "933" 
## SPS "2226" "3464" "2215" "4370" "874"  "1670" "2568" "1737" "2144" "1689"
## STS "1230" "2072" "1297" "3045" "916"  "390"  "1304" "604"  "1053" "410" 
## TIK "1733" "467"  "1774" "687"  "3069" "2313" "1350" "2191" "1778" "2288"
## TIR "27"   "1242" "99"   "2117" "1473" "908"  "393"  "696"  "196"  "881" 
## TRE "858"  "1821" "920"  "2787" "887"  "92"   "986"  "230"  "684"  "80"  
## TUA "694"  "904"  "667"  "1571" "2123" "1569" "641"  "1363" "851"  "1542"
## TUH "1828" "679"  "1844" "316"  "3251" "2549" "1512" "2393" "1919" "2522"
##     FRS    GES    GRA    GRC    IMP    ITB    ITP    ITR    KER    KRA   
## ALD "0.09" "0.14" "0.08" "0.07" "0.12" "0.12" "0.08" "0.08" "0.12" "0.11"
## ALU "0.08" "0.08" "0.08" "0.09" "0.10" "0.11" "0.08" "0.07" "0.07" "0.05"
## ALV "0.07" "0.12" "0.06" "0.05" "0.08" "0.10" "0.06" "0.06" "0.10" "0.09"
## ARM "0.06" "0.05" "0.08" "0.08" "0.08" "0.09" "0.06" "0.07" "0.04" "0.02"
## BAR "0.09" "0.14" "0.09" "0.10" "0.11" "0.13" "0.08" "0.07" "0.12" "0.11"
## BRE "0.12" "0.18" "0.15" "0.15" "0.13" "0.12" "0.12" "0.14" "0.15" "0.15"
## BUL "0.07" "0.13" "0.09" "0.09" "0.09" "0.10" "0.06" "0.05" "0.11" "0.10"
## CES "0.15" "0.21" "0.18" "0.18" "0.18" "0.17" "0.15" "0.17" "0.18" "0.18"
## CRO "0.06" "0.12" "0.06" "0.05" "0.08" "0.09" "0.05" "0.05" "0.10" "0.09"
## DES "0.10" "0.17" "0.14" "0.14" "0.12" "0.11" "0.10" "0.12" "0.14" "0.14"
## FRS NA     "0.11" "0.07" "0.07" "0.05" "0.06" "0.04" "0.04" "0.08" "0.08"
## GES "2794" NA     "0.11" "0.12" "0.13" "0.14" "0.10" "0.10" "0.05" "0.02"
## GRA "1709" "1540" NA     "0.02" "0.08" "0.09" "0.05" "0.05" "0.09" "0.08"
## GRC "1877" "1675" "270"  NA     "0.09" "0.10" "0.06" "0.06" "0.10" "0.09"
## IMP "228"  "2640" "1492" "1653" NA     "0.08" "0.05" "0.05" "0.09" "0.09"
## ITB "448"  "2365" "1279" "1467" "277"  NA     "0.07" "0.07" "0.11" "0.11"
## ITP "1004" "1991" "705"  "884"  "787"  "583"  NA     "0.02" "0.08" "0.07"
## ITR "654"  "2321" "1070" "1224" "428"  "302"  "371"  NA     "0.08" "0.07"
## KER "2391" "434"  "1327" "1517" "2250" "1973" "1655" "1960" NA     "0.03"
## KRA "2596" "257"  "1478" "1648" "2455" "2178" "1848" "2160" "206"  NA    
## MAL "1270" "2395" "873"  "862"  "1049" "996"  "621"  "694"  "2127" "2301"
## POP "1225" "4010" "2777" "2876" "1370" "1645" "2103" "1734" "3614" "3819"
## RAR "2758" "210"  "1629" "1789" "2619" "2342" "2014" "2326" "369"  "166" 
## ROM "654"  "2321" "1070" "1224" "428"  "302"  "371"  "0"    "1960" "2160"
## ROS "1341" "1496" "1100" "1370" "1232" "961"  "884"  "1047" "1073" "1276"
## SER "1100" "1695" "884"  "1141" "949"  "672"  "520"  "697"  "1301" "1506"
## SEV "2180" "614"  "1086" "1288" "2029" "1753" "1410" "1723" "250"  "438" 
## SIC "1119" "2197" "736"  "812"  "891"  "779"  "340"  "483"  "1901" "2084"
## SLO "629"  "2166" "1220" "1440" "507"  "245"  "585"  "455"  "1762" "1968"
## SOC "2689" "109"  "1470" "1618" "2537" "2262" "1898" "2224" "325"  "161" 
## SPB "1264" "3995" "2687" "2760" "1367" "1644" "2040" "1683" "3614" "3818"
## SPC "818"  "3445" "2116" "2193" "857"  "1125" "1474" "1124" "3075" "3277"
## SPM "684"  "3204" "1864" "1944" "665"  "916"  "1227" "883"  "2841" "3042"
## SPS "1371" "3988" "2601" "2643" "1424" "1690" "2002" "1669" "3628" "3829"
## STS "412"  "2620" "1764" "1975" "528"  "536"  "1098" "835"  "2197" "2401"
## TIK "2660" "315"  "1600" "1776" "2527" "2250" "1945" "2247" "290"  "132" 
## TIR "1216" "1743" "515"  "742"  "1009" "775"  "251"  "612"  "1417" "1607"
## TRE "428"  "2370" "1391" "1596" "346"  "176"  "717"  "476"  "1964" "2170"
## TUA "1885" "1265" "277"  "445"  "1678" "1441" "901"  "1271" "1075" "1214"
## TUH "2897" "194"  "1544" "1650" "2734" "2461" "2051" "2395" "597"  "443" 
##     MAL    POP    RAR    ROM    ROS    SER    SEV    SIC    SLO    SOC   
## ALD "0.09" "0.09" "0.15" "0.10" "0.12" "0.22" "0.16" "0.11" "0.08" "0.14"
## ALU "0.07" "0.08" "0.07" "0.08" "0.11" "0.20" "0.10" "0.10" "0.07" "0.07"
## ALV "0.06" "0.07" "0.12" "0.07" "0.10" "0.18" "0.14" "0.09" "0.06" "0.12"
## ARM "0.06" "0.05" "0.05" "0.08" "0.09" "0.19" "0.07" "0.08" "0.05" "0.04"
## BAR "0.08" "0.09" "0.14" "0.08" "0.12" "0.22" "0.16" "0.10" "0.08" "0.13"
## BRE "0.12" "0.11" "0.18" "0.15" "0.13" "0.25" "0.19" "0.12" "0.10" "0.17"
## BUL "0.06" "0.07" "0.13" "0.07" "0.10" "0.20" "0.15" "0.08" "0.07" "0.13"
## CES "0.15" "0.13" "0.21" "0.18" "0.17" "0.28" "0.22" "0.16" "0.13" "0.20"
## CRO "0.06" "0.06" "0.12" "0.06" "0.10" "0.17" "0.13" "0.08" "0.06" "0.11"
## DES "0.11" "0.08" "0.17" "0.13" "0.11" "0.23" "0.18" "0.09" "0.08" "0.16"
## FRS "0.04" "0.04" "0.11" "0.04" "0.07" "0.16" "0.12" "0.06" "0.04" "0.10"
## GES "0.10" "0.10" "0.03" "0.11" "0.14" "0.23" "0.06" "0.12" "0.09" "0.01"
## GRA "0.05" "0.06" "0.11" "0.05" "0.10" "0.18" "0.12" "0.08" "0.05" "0.10"
## GRC "0.06" "0.07" "0.12" "0.06" "0.10" "0.19" "0.13" "0.08" "0.06" "0.11"
## IMP "0.05" "0.05" "0.13" "0.06" "0.10" "0.23" "0.15" "0.07" "0.06" "0.12"
## ITB "0.06" "0.05" "0.14" "0.08" "0.10" "0.22" "0.16" "0.08" "0.06" "0.13"
## ITP "0.02" "0.04" "0.11" "0.02" "0.08" "0.17" "0.12" "0.04" "0.04" "0.10"
## ITR "0.02" "0.03" "0.10" "0.01" "0.08" "0.17" "0.12" "0.05" "0.04" "0.10"
## KER "0.07" "0.07" "0.05" "0.09" "0.11" "0.20" "0.07" "0.10" "0.07" "0.05"
## KRA "0.07" "0.07" "0.01" "0.08" "0.10" "0.20" "0.03" "0.09" "0.06" "0.01"
## MAL NA     "0.03" "0.10" "0.02" "0.07" "0.16" "0.12" "0.05" "0.04" "0.09"
## POP "2067" NA     "0.11" "0.04" "0.07" "0.17" "0.12" "0.05" "0.04" "0.10"
## RAR "2460" "3981" NA     "0.11" "0.14" "0.23" "0.05" "0.12" "0.10" "0.02"
## ROM "694"  "1734" "2326" NA     "0.09" "0.21" "0.13" "0.06" "0.04" "0.10"
## ROS "1498" "2566" "1432" "1047" NA     "0.20" "0.15" "0.09" "0.06" "0.13"
## SER "1139" "2316" "1670" "697"  "367"  NA     "0.25" "0.19" "0.15" "0.22"
## SEV "1878" "3397" "604"  "1723" "894"  "1082" NA     "0.14" "0.11" "0.05"
## SIC "280"  "2065" "2246" "483"  "1219" "858"  "1652" NA     "0.05" "0.12"
## SLO "1118" "1852" "2130" "455"  "726"  "476"  "1551" "865"  NA     "0.09"
## SOC "2316" "3906" "187"  "2224" "1388" "1591" "509"  "2113" "2060" NA    
## SPB "1923" "283"  "3983" "1683" "2594" "2313" "3388" "1957" "1869" "3895"
## SPC "1368" "702"  "3442" "1124" "2086" "1781" "2843" "1387" "1363" "3346"
## SPM "1128" "939"  "3208" "883"  "1873" "1555" "2606" "1136" "1160" "3107"
## SPS "1785" "615"  "3995" "1669" "2651" "2341" "3393" "1865" "1930" "3892"
## STS "1523" "1509" "2555" "835"  "1125" "989"  "2012" "1314" "546"  "2512"
## TIK "2414" "3885" "127"  "2247" "1328" "1580" "539"  "2190" "2033" "253" 
## TIR "764"  "2347" "1772" "612"  "761"  "439"  "1170" "502"  "705"  "1651"
## TRE "1170" "1653" "2331" "476"  "915"  "683"  "1756" "945"  "206"  "2264"
## TUA "1148" "3001" "1361" "1271" "1057" "932"  "843"  "994"  "1340" "1198"
## TUH "2412" "4104" "398"  "2395" "1629" "1797" "734"  "2233" "2272" "283" 
##     SPB    SPC    SPM    SPS    STS    TIK    TIR     TRE    TUA    TUH   
## ALD "0.12" "0.13" "0.10" "0.14" "0.10" "0.16" "0.03"  "0.08" "0.10" "0.09"
## ALU "0.10" "0.11" "0.09" "0.12" "0.09" "0.09" "0.10"  "0.07" "0.09" "0.08"
## ALV "0.09" "0.11" "0.08" "0.11" "0.08" "0.13" "-0.00" "0.06" "0.08" "0.07"
## ARM "0.06" "0.07" "0.06" "0.11" "0.07" "0.07" "0.09"  "0.05" "0.09" "0.07"
## BAR "0.12" "0.13" "0.10" "0.11" "0.09" "0.15" "0.11"  "0.08" "0.06" "0.05"
## BRE "0.10" "0.12" "0.11" "0.18" "0.12" "0.19" "0.17"  "0.09" "0.16" "0.13"
## BUL "0.10" "0.11" "0.09" "0.12" "0.08" "0.15" "0.10"  "0.06" "0.08" "0.07"
## CES "0.13" "0.15" "0.14" "0.21" "0.15" "0.22" "0.20"  "0.11" "0.19" "0.16"
## CRO "0.09" "0.10" "0.08" "0.11" "0.07" "0.13" "0.02"  "0.05" "0.08" "0.07"
## DES "0.08" "0.10" "0.09" "0.16" "0.11" "0.18" "0.15"  "0.07" "0.14" "0.12"
## FRS "0.06" "0.07" "0.05" "0.09" "0.03" "0.12" "0.07"  "0.03" "0.07" "0.05"
## GES "0.12" "0.13" "0.12" "0.15" "0.11" "0.05" "0.13"  "0.10" "0.12" "0.11"
## GRA "0.09" "0.10" "0.08" "0.10" "0.07" "0.12" "0.06"  "0.06" "0.07" "0.06"
## GRC "0.09" "0.10" "0.08" "0.11" "0.07" "0.13" "0.06"  "0.06" "0.08" "0.07"
## IMP "0.07" "0.08" "0.07" "0.10" "0.05" "0.15" "0.10"  "0.04" "0.08" "0.06"
## ITB "0.08" "0.09" "0.07" "0.13" "0.07" "0.16" "0.11"  "0.04" "0.10" "0.08"
## ITP "0.07" "0.08" "0.05" "0.09" "0.04" "0.12" "0.06"  "0.03" "0.06" "0.04"
## ITR "0.08" "0.09" "0.06" "0.07" "0.04" "0.11" "0.06"  "0.04" "0.04" "0.04"
## KER "0.09" "0.10" "0.09" "0.13" "0.08" "0.07" "0.11"  "0.07" "0.10" "0.09"
## KRA "0.09" "0.10" "0.08" "0.12" "0.08" "0.04" "0.10"  "0.07" "0.09" "0.08"
## MAL "0.06" "0.08" "0.05" "0.07" "0.04" "0.11" "0.07"  "0.03" "0.05" "0.04"
## POP "0.05" "0.06" "0.04" "0.09" "0.04" "0.12" "0.07"  "0.03" "0.06" "0.05"
## RAR "0.13" "0.14" "0.12" "0.16" "0.11" "0.05" "0.14"  "0.10" "0.12" "0.11"
## ROM "0.08" "0.10" "0.07" "0.09" "0.04" "0.13" "0.07"  "0.03" "0.05" "0.04"
## ROS "0.09" "0.10" "0.09" "0.13" "0.07" "0.15" "0.11"  "0.05" "0.10" "0.08"
## SER "0.20" "0.22" "0.21" "0.24" "0.18" "0.25" "0.22"  "0.15" "0.20" "0.18"
## SEV "0.14" "0.15" "0.14" "0.17" "0.13" "0.08" "0.15"  "0.11" "0.14" "0.12"
## SIC "0.08" "0.09" "0.06" "0.11" "0.06" "0.14" "0.09"  "0.04" "0.08" "0.06"
## SLO "0.05" "0.06" "0.04" "0.09" "0.05" "0.11" "0.06"  "0.02" "0.06" "0.05"
## SOC "0.11" "0.12" "0.11" "0.15" "0.11" "0.06" "0.13"  "0.09" "0.12" "0.10"
## SPB NA     "0.01" "0.01" "0.12" "0.06" "0.14" "0.10"  "0.04" "0.10" "0.08"
## SPC "571"  NA     "0.02" "0.14" "0.07" "0.15" "0.11"  "0.05" "0.10" "0.08"
## SPM "823"  "252"  NA     "0.05" "0.05" "0.13" "0.10"  "0.03" "0.08" "0.06"
## SPS "332"  "567"  "787"  NA     "0.10" "0.17" "0.13"  "0.08" "0.03" "0.09"
## STS "1601" "1214" "1096" "1750" NA     "0.12" "0.08"  "0.03" "0.07" "0.05"
## TIK "3893" "3358" "3126" "3913" "2447" NA     "0.15"  "0.11" "0.13" "0.12"
## TIR "2290" "1725" "1478" "2253" "1248" "1707" NA      "0.06" "0.08" "0.07"
## TRE "1682" "1196" "1011" "1761" "381"  "2233" "877"   NA     "0.06" "0.04"
## TUA "2925" "2354" "2103" "2855" "1886" "1339" "669"   "1529" NA     "0.05"
## TUH "4076" "3519" "3275" "4054" "2745" "508"  "1801"  "2478" "1267" NA
cities <- readRDS(here("output", "sampling_loc_euro_global.rds"))
cities <- as_tibble(cities)
head(cities)
## # A tibble: 6 × 12
##   Pop_City     Country Latitude Longitude Continent Abbreviation Year  Region   
##   <chr>        <chr>      <dbl>     <dbl> <chr>     <chr>        <chr> <chr>    
## 1 Berlin, NJ   USA        39.8      -74.9 Americas  BER          2018  North Am…
## 2 Columbus, OH USA        40.0      -82.9 Americas  COL          2015  North Am…
## 3 Palm Beach   USA        26.7      -80.0 Americas  PAL          2018  North Am…
## 4 Houston, TX  USA        29.8      -95.4 Americas  HOU          2018  North Am…
## 5 Los Angeles  USA        34.1     -118.  Americas  LOS          2018  North Am…
## 6 Manaus, AM   Brazil     -3.09     -60.0 Americas  MAU          2017  South Am…
## # ℹ 4 more variables: Subregion <chr>, order <int>, order2 <int>,
## #   orderold <int>

We can sort by distance

# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Your existing code to initialize and fill the merged_matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Formatting code with absolute value for upper triangular part
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     CRO    ALD    ITP    TIR    SER    ALV    CES    ITR    ROM    SLO   
## CRO NA     "0.05" "0.05" "0.02" "0.17" "0.03" "0.18" "0.05" "0.06" "0.06"
## ALD "180"  NA     "0.08" "0.03" "0.22" "0.02" "0.20" "0.08" "0.10" "0.08"
## ITP "201"  "224"  NA     "0.06" "0.17" "0.06" "0.15" "0.02" "0.02" "0.04"
## TIR "196"  "27"   "251"  NA     "0.22" "0.00" "0.20" "0.06" "0.07" "0.06"
## SER "322"  "442"  "520"  "439"  NA     "0.18" "0.28" "0.17" "0.21" "0.15"
## ALV "260"  "93"   "235"  "99"   "534"  NA     "0.18" "0.06" "0.07" "0.06"
## CES "514"  "673"  "506"  "696"  "612"  "724"  NA     "0.17" "0.18" "0.13"
## ITR "477"  "585"  "371"  "612"  "697"  "605"  "252"  NA     "0.01" "0.04"
## ROM "477"  "585"  "371"  "612"  "697"  "605"  "252"  "0"    NA     "0.04"
## SLO "509"  "689"  "585"  "705"  "476"  "761"  "227"  "455"  "455"  NA    
## ITB "591"  "752"  "583"  "775"  "672"  "802"  "79"   "302"  "302"  "245" 
## BUL "428"  "414"  "604"  "393"  "316"  "483"  "881"  "899"  "899"  "782" 
## DES "691"  "859"  "702"  "881"  "725"  "915"  "198"  "427"  "427"  "255" 
## TRE "684"  "858"  "717"  "877"  "683"  "920"  "230"  "476"  "476"  "206" 
## BRE "719"  "887"  "728"  "908"  "752"  "942"  "223"  "445"  "445"  "280" 
## SIC "537"  "480"  "340"  "502"  "858"  "420"  "715"  "483"  "483"  "865" 
## ROS "683"  "771"  "884"  "761"  "367"  "859"  "917"  "1047" "1047" "726" 
## IMP "840"  "984"  "787"  "1009" "949"  "1020" "341"  "428"  "428"  "507" 
## GRA "711"  "534"  "705"  "515"  "884"  "476"  "1200" "1070" "1070" "1220"
## TUA "851"  "694"  "901"  "669"  "932"  "667"  "1363" "1271" "1271" "1340"
## FRS "1038" "1192" "1004" "1216" "1100" "1234" "525"  "654"  "654"  "629" 
## MAL "817"  "746"  "621"  "764"  "1139" "674"  "943"  "694"  "694"  "1118"
## STS "1053" "1230" "1098" "1248" "989"  "1297" "604"  "835"  "835"  "546" 
## GRC "934"  "754"  "884"  "742"  "1141" "679"  "1389" "1224" "1224" "1440"
## SEV "1250" "1196" "1410" "1170" "1082" "1235" "1689" "1723" "1723" "1551"
## ALU "1321" "1268" "1482" "1242" "1149" "1307" "1757" "1794" "1794" "1617"
## BAR "1333" "1446" "1227" "1473" "1489" "1458" "878"  "862"  "862"  "1059"
## SPM "1360" "1450" "1227" "1478" "1555" "1448" "956"  "883"  "883"  "1160"
## KER "1490" "1443" "1655" "1417" "1301" "1484" "1913" "1960" "1960" "1762"
## KRA "1687" "1633" "1848" "1607" "1506" "1670" "2117" "2160" "2160" "1968"
## SPC "1601" "1698" "1474" "1725" "1781" "1698" "1173" "1124" "1124" "1363"
## SOC "1748" "1678" "1898" "1651" "1591" "1706" "2197" "2224" "2224" "2060"
## TIK "1778" "1733" "1945" "1707" "1580" "1774" "2191" "2247" "2247" "2033"
## GES "1845" "1770" "1991" "1743" "1695" "1795" "2300" "2321" "2321" "2166"
## RAR "1853" "1798" "2014" "1772" "1670" "1833" "2281" "2326" "2326" "2130"
## TUH "1919" "1828" "2051" "1801" "1797" "1844" "2393" "2395" "2395" "2272"
## ARM "2235" "2144" "2367" "2117" "2107" "2159" "2706" "2711" "2711" "2581"
## SPB "2158" "2263" "2040" "2290" "2313" "2266" "1702" "1683" "1683" "1869"
## SPS "2144" "2226" "2002" "2253" "2341" "2215" "1737" "1669" "1669" "1930"
## POP "2197" "2320" "2103" "2347" "2316" "2335" "1711" "1734" "1734" "1852"
##     ITB    BUL    DES    TRE    BRE    SIC    ROS    IMP    GRA    TUA   
## CRO "0.09" "0.09" "0.13" "0.05" "0.15" "0.08" "0.10" "0.08" "0.06" "0.08"
## ALD "0.12" "0.11" "0.16" "0.08" "0.17" "0.11" "0.12" "0.12" "0.08" "0.10"
## ITP "0.07" "0.06" "0.10" "0.03" "0.12" "0.04" "0.08" "0.05" "0.05" "0.06"
## TIR "0.11" "0.10" "0.15" "0.06" "0.17" "0.09" "0.11" "0.10" "0.06" "0.08"
## SER "0.22" "0.20" "0.23" "0.15" "0.25" "0.19" "0.20" "0.23" "0.18" "0.20"
## ALV "0.10" "0.09" "0.14" "0.06" "0.15" "0.09" "0.10" "0.08" "0.06" "0.08"
## CES "0.17" "0.19" "0.14" "0.11" "0.16" "0.16" "0.17" "0.18" "0.18" "0.19"
## ITR "0.07" "0.05" "0.12" "0.04" "0.14" "0.05" "0.08" "0.05" "0.05" "0.04"
## ROM "0.08" "0.07" "0.13" "0.03" "0.15" "0.06" "0.09" "0.06" "0.05" "0.05"
## SLO "0.06" "0.07" "0.08" "0.02" "0.10" "0.05" "0.06" "0.06" "0.05" "0.06"
## ITB NA     "0.10" "0.11" "0.04" "0.12" "0.08" "0.10" "0.08" "0.09" "0.10"
## BUL "950"  NA     "0.14" "0.06" "0.15" "0.08" "0.10" "0.09" "0.09" "0.08"
## DES "126"  "1021" NA     "0.07" "0.09" "0.09" "0.11" "0.12" "0.14" "0.14"
## TRE "176"  "986"  "80"   NA     "0.09" "0.04" "0.05" "0.04" "0.06" "0.06"
## BRE "148"  "1048" "28"   "92"   NA     "0.12" "0.13" "0.13" "0.15" "0.16"
## SIC "779"  "894"  "905"  "945"  "925"  NA     "0.09" "0.07" "0.08" "0.08"
## ROS "961"  "445"  "977"  "915"  "1000" "1219" NA     "0.10" "0.10" "0.10"
## IMP "277"  "1221" "268"  "346"  "254"  "891"  "1232" NA     "0.08" "0.08"
## GRA "1279" "656"  "1390" "1391" "1417" "736"  "1100" "1492" NA     "0.07"
## TUA "1441" "641"  "1542" "1529" "1569" "994"  "1057" "1678" "277"  NA    
## FRS "448"  "1393" "375"  "428"  "349"  "1119" "1341" "228"  "1709" "1885"
## MAL "996"  "1157" "1119" "1170" "1135" "280"  "1498" "1049" "873"  "1148"
## STS "536"  "1304" "410"  "381"  "390"  "1314" "1125" "528"  "1764" "1886"
## GRC "1467" "925"  "1585" "1596" "1611" "812"  "1370" "1653" "270"  "445" 
## SEV "1753" "824"  "1804" "1756" "1830" "1652" "894"  "2029" "1086" "843" 
## ALU "1821" "895"  "1870" "1821" "1896" "1724" "950"  "2097" "1151" "904" 
## BAR "824"  "1742" "815"  "887"  "796"  "1195" "1783" "552"  "1900" "2123"
## SPM "916"  "1782" "933"  "1011" "919"  "1136" "1873" "665"  "1864" "2103"
## KER "1973" "1062" "2017" "1964" "2042" "1901" "1073" "2250" "1327" "1075"
## KRA "2178" "1261" "2222" "2170" "2248" "2084" "1276" "2455" "1478" "1214"
## SPC "1125" "2020" "1122" "1196" "1104" "1387" "2086" "857"  "2116" "2354"
## SOC "2262" "1327" "2313" "2264" "2339" "2113" "1388" "2537" "1470" "1198"
## TIK "2250" "1350" "2288" "2233" "2313" "2190" "1328" "2527" "1600" "1339"
## GES "2365" "1426" "2418" "2370" "2444" "2197" "1496" "2640" "1540" "1265"
## RAR "2342" "1427" "2385" "2331" "2410" "2246" "1432" "2619" "1629" "1361"
## TUH "2461" "1512" "2522" "2478" "2549" "2233" "1629" "2734" "1544" "1267"
## ARM "2773" "1826" "2832" "2787" "2859" "2545" "1923" "3047" "1848" "1571"
## SPB "1644" "2569" "1617" "1682" "1595" "1957" "2594" "1367" "2687" "2925"
## SPS "1690" "2568" "1689" "1761" "1670" "1865" "2651" "1424" "2601" "2855"
## POP "1645" "2591" "1597" "1653" "1572" "2065" "2566" "1370" "2777" "3001"
##     FRS    MAL    STS    GRC    SEV    ALU    BAR    SPM    KER    KRA   
## CRO "0.06" "0.06" "0.07" "0.05" "0.13" "0.09" "0.10" "0.08" "0.10" "0.09"
## ALD "0.09" "0.09" "0.10" "0.07" "0.16" "0.12" "0.12" "0.10" "0.12" "0.11"
## ITP "0.04" "0.02" "0.04" "0.06" "0.12" "0.08" "0.08" "0.05" "0.08" "0.07"
## TIR "0.07" "0.07" "0.08" "0.06" "0.15" "0.10" "0.11" "0.10" "0.11" "0.10"
## SER "0.16" "0.16" "0.18" "0.19" "0.25" "0.20" "0.22" "0.21" "0.20" "0.20"
## ALV "0.07" "0.06" "0.08" "0.05" "0.14" "0.10" "0.10" "0.08" "0.10" "0.09"
## CES "0.15" "0.15" "0.15" "0.18" "0.22" "0.19" "0.20" "0.14" "0.18" "0.18"
## ITR "0.04" "0.02" "0.04" "0.06" "0.12" "0.07" "0.07" "0.06" "0.08" "0.07"
## ROM "0.04" "0.02" "0.04" "0.06" "0.13" "0.08" "0.08" "0.07" "0.09" "0.08"
## SLO "0.04" "0.04" "0.05" "0.06" "0.11" "0.07" "0.08" "0.04" "0.07" "0.06"
## ITB "0.06" "0.06" "0.07" "0.10" "0.16" "0.11" "0.13" "0.07" "0.11" "0.11"
## BUL "0.07" "0.06" "0.08" "0.09" "0.15" "0.10" "0.11" "0.09" "0.11" "0.10"
## DES "0.10" "0.11" "0.11" "0.14" "0.18" "0.14" "0.16" "0.09" "0.14" "0.14"
## TRE "0.03" "0.03" "0.03" "0.06" "0.11" "0.07" "0.08" "0.03" "0.07" "0.07"
## BRE "0.12" "0.12" "0.12" "0.15" "0.19" "0.16" "0.17" "0.11" "0.15" "0.15"
## SIC "0.06" "0.05" "0.06" "0.08" "0.14" "0.10" "0.10" "0.06" "0.10" "0.09"
## ROS "0.07" "0.07" "0.07" "0.10" "0.15" "0.11" "0.12" "0.09" "0.11" "0.10"
## IMP "0.05" "0.05" "0.05" "0.09" "0.15" "0.10" "0.11" "0.07" "0.09" "0.09"
## GRA "0.07" "0.05" "0.07" "0.02" "0.12" "0.08" "0.09" "0.08" "0.09" "0.08"
## TUA "0.07" "0.05" "0.07" "0.08" "0.14" "0.09" "0.06" "0.08" "0.10" "0.09"
## FRS NA     "0.04" "0.03" "0.07" "0.12" "0.08" "0.09" "0.05" "0.08" "0.08"
## MAL "1270" NA     "0.04" "0.06" "0.12" "0.07" "0.08" "0.05" "0.07" "0.07"
## STS "412"  "1523" NA     "0.07" "0.13" "0.09" "0.09" "0.05" "0.08" "0.08"
## GRC "1877" "862"  "1975" NA     "0.13" "0.09" "0.10" "0.08" "0.10" "0.09"
## SEV "2180" "1878" "2012" "1288" NA     "0.10" "0.16" "0.14" "0.07" "0.03"
## ALU "2246" "1948" "2072" "1348" "72"   NA     "0.11" "0.09" "0.07" "0.05"
## BAR "512"  "1233" "916"  "2007" "2561" "2631" NA     "0.10" "0.12" "0.11"
## SPM "684"  "1128" "1096" "1944" "2606" "2677" "211"  NA     "0.09" "0.08"
## KER "2391" "2127" "2197" "1517" "250"  "179"  "2789" "2841" NA     "0.03"
## KRA "2596" "2301" "2401" "1648" "438"  "366"  "2992" "3042" "206"  NA    
## SPC "818"  "1368" "1214" "2193" "2843" "2913" "310"  "252"  "3075" "3277"
## SOC "2689" "2316" "2512" "1618" "509"  "443"  "3068" "3107" "325"  "161" 
## TIK "2660" "2414" "2447" "1776" "539"  "467"  "3069" "3126" "290"  "132" 
## GES "2794" "2395" "2620" "1675" "614"  "550"  "3168" "3204" "434"  "257" 
## RAR "2758" "2460" "2555" "1789" "604"  "532"  "3157" "3208" "369"  "166" 
## TUH "2897" "2412" "2745" "1650" "734"  "679"  "3251" "3275" "597"  "443" 
## ARM "3207" "2719" "3045" "1937" "1034" "974"  "3566" "3591" "862"  "674" 
## SPB "1264" "1923" "1601" "2760" "3388" "3457" "827"  "823"  "3614" "3818"
## SPS "1371" "1785" "1750" "2643" "3393" "3464" "874"  "787"  "3628" "3829"
## POP "1225" "2067" "1509" "2876" "3397" "3464" "878"  "939"  "3614" "3819"
##     SPC    SOC    TIK    GES    RAR    TUH    ARM    SPB    SPS    POP   
## CRO "0.10" "0.11" "0.13" "0.12" "0.12" "0.07" "0.08" "0.09" "0.11" "0.06"
## ALD "0.13" "0.14" "0.16" "0.14" "0.15" "0.09" "0.11" "0.12" "0.14" "0.09"
## ITP "0.08" "0.10" "0.12" "0.10" "0.11" "0.04" "0.06" "0.07" "0.09" "0.04"
## TIR "0.11" "0.13" "0.15" "0.13" "0.14" "0.07" "0.09" "0.10" "0.13" "0.07"
## SER "0.22" "0.22" "0.25" "0.23" "0.23" "0.18" "0.19" "0.20" "0.24" "0.17"
## ALV "0.11" "0.12" "0.13" "0.12" "0.12" "0.07" "0.09" "0.09" "0.11" "0.07"
## CES "0.15" "0.20" "0.22" "0.21" "0.21" "0.16" "0.16" "0.13" "0.21" "0.13"
## ITR "0.09" "0.10" "0.11" "0.10" "0.10" "0.04" "0.07" "0.08" "0.07" "0.03"
## ROM "0.10" "0.10" "0.13" "0.11" "0.11" "0.04" "0.08" "0.08" "0.09" "0.04"
## SLO "0.06" "0.09" "0.11" "0.09" "0.10" "0.05" "0.05" "0.05" "0.09" "0.04"
## ITB "0.09" "0.13" "0.16" "0.14" "0.14" "0.08" "0.09" "0.08" "0.13" "0.05"
## BUL "0.11" "0.13" "0.15" "0.13" "0.13" "0.07" "0.09" "0.10" "0.12" "0.07"
## DES "0.10" "0.16" "0.18" "0.17" "0.17" "0.12" "0.11" "0.08" "0.16" "0.08"
## TRE "0.05" "0.09" "0.11" "0.10" "0.10" "0.04" "0.05" "0.04" "0.08" "0.03"
## BRE "0.12" "0.17" "0.19" "0.18" "0.18" "0.13" "0.13" "0.10" "0.18" "0.11"
## SIC "0.09" "0.12" "0.14" "0.12" "0.12" "0.06" "0.08" "0.08" "0.11" "0.05"
## ROS "0.10" "0.13" "0.15" "0.14" "0.14" "0.08" "0.09" "0.09" "0.13" "0.07"
## IMP "0.08" "0.12" "0.15" "0.13" "0.13" "0.06" "0.08" "0.07" "0.10" "0.05"
## GRA "0.10" "0.10" "0.12" "0.11" "0.11" "0.06" "0.08" "0.09" "0.10" "0.06"
## TUA "0.10" "0.12" "0.13" "0.12" "0.12" "0.05" "0.09" "0.10" "0.03" "0.06"
## FRS "0.07" "0.10" "0.12" "0.11" "0.11" "0.05" "0.06" "0.06" "0.09" "0.04"
## MAL "0.08" "0.09" "0.11" "0.10" "0.10" "0.04" "0.06" "0.06" "0.07" "0.03"
## STS "0.07" "0.11" "0.12" "0.11" "0.11" "0.05" "0.07" "0.06" "0.10" "0.04"
## GRC "0.10" "0.11" "0.13" "0.12" "0.12" "0.07" "0.08" "0.09" "0.11" "0.07"
## SEV "0.15" "0.05" "0.08" "0.06" "0.05" "0.12" "0.07" "0.14" "0.17" "0.12"
## ALU "0.11" "0.07" "0.09" "0.08" "0.07" "0.08" "0.06" "0.10" "0.12" "0.08"
## BAR "0.13" "0.13" "0.15" "0.14" "0.14" "0.05" "0.11" "0.12" "0.11" "0.09"
## SPM "0.02" "0.11" "0.13" "0.12" "0.12" "0.06" "0.06" "0.01" "0.05" "0.04"
## KER "0.10" "0.05" "0.07" "0.05" "0.05" "0.09" "0.04" "0.09" "0.13" "0.07"
## KRA "0.10" "0.01" "0.04" "0.02" "0.01" "0.08" "0.02" "0.09" "0.12" "0.07"
## SPC NA     "0.12" "0.15" "0.13" "0.14" "0.08" "0.07" "0.01" "0.14" "0.06"
## SOC "3346" NA     "0.06" "0.01" "0.02" "0.10" "0.04" "0.11" "0.15" "0.10"
## TIK "3358" "253"  NA     "0.05" "0.05" "0.12" "0.07" "0.14" "0.17" "0.12"
## GES "3445" "109"  "315"  NA     "0.03" "0.11" "0.05" "0.12" "0.15" "0.10"
## RAR "3442" "187"  "127"  "210"  NA     "0.11" "0.05" "0.13" "0.16" "0.11"
## TUH "3519" "283"  "508"  "194"  "398"  NA     "0.07" "0.08" "0.09" "0.05"
## ARM "3835" "538"  "687"  "429"  "560"  "316"  NA     "0.06" "0.11" "0.05"
## SPB "571"  "3895" "3893" "3995" "3983" "4076" "4392" NA     "0.12" "0.05"
## SPS "567"  "3892" "3913" "3988" "3995" "4054" "4370" "332"  NA     "0.09"
## POP "702"  "3906" "3885" "4010" "3981" "4104" "4417" "283"  "615"  NA

Make a table and save it as a word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]

merged_df1 <- as.data.frame(merged_df)
write.csv(merged_df1, "/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/fst/merged_df.csv")

# Create a flextable object from the merged_matrix
ft <- qflextable(as.data.frame(merged_df)) 

ft

Population

CRO

ALD

ITP

TIR

SER

ALV

CES

ITR

ROM

SLO

ITB

BUL

DES

TRE

BRE

SIC

ROS

IMP

GRA

TUA

FRS

MAL

STS

GRC

SEV

ALU

BAR

SPM

KER

KRA

SPC

SOC

TIK

GES

RAR

TUH

ARM

SPB

SPS

POP

CRO

0.05

0.05

0.02

0.17

0.03

0.18

0.05

0.06

0.06

0.09

0.09

0.13

0.05

0.15

0.08

0.10

0.08

0.06

0.08

0.06

0.06

0.07

0.05

0.13

0.09

0.10

0.08

0.10

0.09

0.10

0.11

0.13

0.12

0.12

0.07

0.08

0.09

0.11

0.06

ALD

180

0.08

0.03

0.22

0.02

0.20

0.08

0.10

0.08

0.12

0.11

0.16

0.08

0.17

0.11

0.12

0.12

0.08

0.10

0.09

0.09

0.10

0.07

0.16

0.12

0.12

0.10

0.12

0.11

0.13

0.14

0.16

0.14

0.15

0.09

0.11

0.12

0.14

0.09

ITP

201

224

0.06

0.17

0.06

0.15

0.02

0.02

0.04

0.07

0.06

0.10

0.03

0.12

0.04

0.08

0.05

0.05

0.06

0.04

0.02

0.04

0.06

0.12

0.08

0.08

0.05

0.08

0.07

0.08

0.10

0.12

0.10

0.11

0.04

0.06

0.07

0.09

0.04

TIR

196

27

251

0.22

0.00

0.20

0.06

0.07

0.06

0.11

0.10

0.15

0.06

0.17

0.09

0.11

0.10

0.06

0.08

0.07

0.07

0.08

0.06

0.15

0.10

0.11

0.10

0.11

0.10

0.11

0.13

0.15

0.13

0.14

0.07

0.09

0.10

0.13

0.07

SER

322

442

520

439

0.18

0.28

0.17

0.21

0.15

0.22

0.20

0.23

0.15

0.25

0.19

0.20

0.23

0.18

0.20

0.16

0.16

0.18

0.19

0.25

0.20

0.22

0.21

0.20

0.20

0.22

0.22

0.25

0.23

0.23

0.18

0.19

0.20

0.24

0.17

ALV

260

93

235

99

534

0.18

0.06

0.07

0.06

0.10

0.09

0.14

0.06

0.15

0.09

0.10

0.08

0.06

0.08

0.07

0.06

0.08

0.05

0.14

0.10

0.10

0.08

0.10

0.09

0.11

0.12

0.13

0.12

0.12

0.07

0.09

0.09

0.11

0.07

CES

514

673

506

696

612

724

0.17

0.18

0.13

0.17

0.19

0.14

0.11

0.16

0.16

0.17

0.18

0.18

0.19

0.15

0.15

0.15

0.18

0.22

0.19

0.20

0.14

0.18

0.18

0.15

0.20

0.22

0.21

0.21

0.16

0.16

0.13

0.21

0.13

ITR

477

585

371

612

697

605

252

0.01

0.04

0.07

0.05

0.12

0.04

0.14

0.05

0.08

0.05

0.05

0.04

0.04

0.02

0.04

0.06

0.12

0.07

0.07

0.06

0.08

0.07

0.09

0.10

0.11

0.10

0.10

0.04

0.07

0.08

0.07

0.03

ROM

477

585

371

612

697

605

252

0

0.04

0.08

0.07

0.13

0.03

0.15

0.06

0.09

0.06

0.05

0.05

0.04

0.02

0.04

0.06

0.13

0.08

0.08

0.07

0.09

0.08

0.10

0.10

0.13

0.11

0.11

0.04

0.08

0.08

0.09

0.04

SLO

509

689

585

705

476

761

227

455

455

0.06

0.07

0.08

0.02

0.10

0.05

0.06

0.06

0.05

0.06

0.04

0.04

0.05

0.06

0.11

0.07

0.08

0.04

0.07

0.06

0.06

0.09

0.11

0.09

0.10

0.05

0.05

0.05

0.09

0.04

ITB

591

752

583

775

672

802

79

302

302

245

0.10

0.11

0.04

0.12

0.08

0.10

0.08

0.09

0.10

0.06

0.06

0.07

0.10

0.16

0.11

0.13

0.07

0.11

0.11

0.09

0.13

0.16

0.14

0.14

0.08

0.09

0.08

0.13

0.05

BUL

428

414

604

393

316

483

881

899

899

782

950

0.14

0.06

0.15

0.08

0.10

0.09

0.09

0.08

0.07

0.06

0.08

0.09

0.15

0.10

0.11

0.09

0.11

0.10

0.11

0.13

0.15

0.13

0.13

0.07

0.09

0.10

0.12

0.07

DES

691

859

702

881

725

915

198

427

427

255

126

1021

0.07

0.09

0.09

0.11

0.12

0.14

0.14

0.10

0.11

0.11

0.14

0.18

0.14

0.16

0.09

0.14

0.14

0.10

0.16

0.18

0.17

0.17

0.12

0.11

0.08

0.16

0.08

TRE

684

858

717

877

683

920

230

476

476

206

176

986

80

0.09

0.04

0.05

0.04

0.06

0.06

0.03

0.03

0.03

0.06

0.11

0.07

0.08

0.03

0.07

0.07

0.05

0.09

0.11

0.10

0.10

0.04

0.05

0.04

0.08

0.03

BRE

719

887

728

908

752

942

223

445

445

280

148

1048

28

92

0.12

0.13

0.13

0.15

0.16

0.12

0.12

0.12

0.15

0.19

0.16

0.17

0.11

0.15

0.15

0.12

0.17

0.19

0.18

0.18

0.13

0.13

0.10

0.18

0.11

SIC

537

480

340

502

858

420

715

483

483

865

779

894

905

945

925

0.09

0.07

0.08

0.08

0.06

0.05

0.06

0.08

0.14

0.10

0.10

0.06

0.10

0.09

0.09

0.12

0.14

0.12

0.12

0.06

0.08

0.08

0.11

0.05

ROS

683

771

884

761

367

859

917

1047

1047

726

961

445

977

915

1000

1219

0.10

0.10

0.10

0.07

0.07

0.07

0.10

0.15

0.11

0.12

0.09

0.11

0.10

0.10

0.13

0.15

0.14

0.14

0.08

0.09

0.09

0.13

0.07

IMP

840

984

787

1009

949

1020

341

428

428

507

277

1221

268

346

254

891

1232

0.08

0.08

0.05

0.05

0.05

0.09

0.15

0.10

0.11

0.07

0.09

0.09

0.08

0.12

0.15

0.13

0.13

0.06

0.08

0.07

0.10

0.05

GRA

711

534

705

515

884

476

1200

1070

1070

1220

1279

656

1390

1391

1417

736

1100

1492

0.07

0.07

0.05

0.07

0.02

0.12

0.08

0.09

0.08

0.09

0.08

0.10

0.10

0.12

0.11

0.11

0.06

0.08

0.09

0.10

0.06

TUA

851

694

901

669

932

667

1363

1271

1271

1340

1441

641

1542

1529

1569

994

1057

1678

277

0.07

0.05

0.07

0.08

0.14

0.09

0.06

0.08

0.10

0.09

0.10

0.12

0.13

0.12

0.12

0.05

0.09

0.10

0.03

0.06

FRS

1038

1192

1004

1216

1100

1234

525

654

654

629

448

1393

375

428

349

1119

1341

228

1709

1885

0.04

0.03

0.07

0.12

0.08

0.09

0.05

0.08

0.08

0.07

0.10

0.12

0.11

0.11

0.05

0.06

0.06

0.09

0.04

MAL

817

746

621

764

1139

674

943

694

694

1118

996

1157

1119

1170

1135

280

1498

1049

873

1148

1270

0.04

0.06

0.12

0.07

0.08

0.05

0.07

0.07

0.08

0.09

0.11

0.10

0.10

0.04

0.06

0.06

0.07

0.03

STS

1053

1230

1098

1248

989

1297

604

835

835

546

536

1304

410

381

390

1314

1125

528

1764

1886

412

1523

0.07

0.13

0.09

0.09

0.05

0.08

0.08

0.07

0.11

0.12

0.11

0.11

0.05

0.07

0.06

0.10

0.04

GRC

934

754

884

742

1141

679

1389

1224

1224

1440

1467

925

1585

1596

1611

812

1370

1653

270

445

1877

862

1975

0.13

0.09

0.10

0.08

0.10

0.09

0.10

0.11

0.13

0.12

0.12

0.07

0.08

0.09

0.11

0.07

SEV

1250

1196

1410

1170

1082

1235

1689

1723

1723

1551

1753

824

1804

1756

1830

1652

894

2029

1086

843

2180

1878

2012

1288

0.10

0.16

0.14

0.07

0.03

0.15

0.05

0.08

0.06

0.05

0.12

0.07

0.14

0.17

0.12

ALU

1321

1268

1482

1242

1149

1307

1757

1794

1794

1617

1821

895

1870

1821

1896

1724

950

2097

1151

904

2246

1948

2072

1348

72

0.11

0.09

0.07

0.05

0.11

0.07

0.09

0.08

0.07

0.08

0.06

0.10

0.12

0.08

BAR

1333

1446

1227

1473

1489

1458

878

862

862

1059

824

1742

815

887

796

1195

1783

552

1900

2123

512

1233

916

2007

2561

2631

0.10

0.12

0.11

0.13

0.13

0.15

0.14

0.14

0.05

0.11

0.12

0.11

0.09

SPM

1360

1450

1227

1478

1555

1448

956

883

883

1160

916

1782

933

1011

919

1136

1873

665

1864

2103

684

1128

1096

1944

2606

2677

211

0.09

0.08

0.02

0.11

0.13

0.12

0.12

0.06

0.06

0.01

0.05

0.04

KER

1490

1443

1655

1417

1301

1484

1913

1960

1960

1762

1973

1062

2017

1964

2042

1901

1073

2250

1327

1075

2391

2127

2197

1517

250

179

2789

2841

0.03

0.10

0.05

0.07

0.05

0.05

0.09

0.04

0.09

0.13

0.07

KRA

1687

1633

1848

1607

1506

1670

2117

2160

2160

1968

2178

1261

2222

2170

2248

2084

1276

2455

1478

1214

2596

2301

2401

1648

438

366

2992

3042

206

0.10

0.01

0.04

0.02

0.01

0.08

0.02

0.09

0.12

0.07

SPC

1601

1698

1474

1725

1781

1698

1173

1124

1124

1363

1125

2020

1122

1196

1104

1387

2086

857

2116

2354

818

1368

1214

2193

2843

2913

310

252

3075

3277

0.12

0.15

0.13

0.14

0.08

0.07

0.01

0.14

0.06

SOC

1748

1678

1898

1651

1591

1706

2197

2224

2224

2060

2262

1327

2313

2264

2339

2113

1388

2537

1470

1198

2689

2316

2512

1618

509

443

3068

3107

325

161

3346

0.06

0.01

0.02

0.10

0.04

0.11

0.15

0.10

TIK

1778

1733

1945

1707

1580

1774

2191

2247

2247

2033

2250

1350

2288

2233

2313

2190

1328

2527

1600

1339

2660

2414

2447

1776

539

467

3069

3126

290

132

3358

253

0.05

0.05

0.12

0.07

0.14

0.17

0.12

GES

1845

1770

1991

1743

1695

1795

2300

2321

2321

2166

2365

1426

2418

2370

2444

2197

1496

2640

1540

1265

2794

2395

2620

1675

614

550

3168

3204

434

257

3445

109

315

0.03

0.11

0.05

0.12

0.15

0.10

RAR

1853

1798

2014

1772

1670

1833

2281

2326

2326

2130

2342

1427

2385

2331

2410

2246

1432

2619

1629

1361

2758

2460

2555

1789

604

532

3157

3208

369

166

3442

187

127

210

0.11

0.05

0.13

0.16

0.11

TUH

1919

1828

2051

1801

1797

1844

2393

2395

2395

2272

2461

1512

2522

2478

2549

2233

1629

2734

1544

1267

2897

2412

2745

1650

734

679

3251

3275

597

443

3519

283

508

194

398

0.07

0.08

0.09

0.05

ARM

2235

2144

2367

2117

2107

2159

2706

2711

2711

2581

2773

1826

2832

2787

2859

2545

1923

3047

1848

1571

3207

2719

3045

1937

1034

974

3566

3591

862

674

3835

538

687

429

560

316

0.06

0.11

0.05

SPB

2158

2263

2040

2290

2313

2266

1702

1683

1683

1869

1644

2569

1617

1682

1595

1957

2594

1367

2687

2925

1264

1923

1601

2760

3388

3457

827

823

3614

3818

571

3895

3893

3995

3983

4076

4392

0.12

0.05

SPS

2144

2226

2002

2253

2341

2215

1737

1669

1669

1930

1690

2568

1689

1761

1670

1865

2651

1424

2601

2855

1371

1785

1750

2643

3393

3464

874

787

3628

3829

567

3892

3913

3988

3995

4054

4370

332

0.09

POP

2197

2320

2103

2347

2316

2335

1711

1734

1734

1852

1645

2591

1597

1653

1572

2065

2566

1370

2777

3001

1225

2067

1509

2876

3397

3464

878

939

3614

3819

702

3906

3885

4010

3981

4104

4417

283

615

2. Mantel test with Europe Set 2

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.1 \
--keep-fam output/fst/pops_4fst.txt \
--make-bed \
--out output/fst/mantel \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel.log

47484 variants loaded from .bim file. –keep-fam: 407 people remaining. Total genotyping rate in remaining samples is 0.97243. 47484 variants and 407 people pass filters and QC.

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/mantel \
--recodeA \
--out output/fst/mantel \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel.log

47484 variants loaded from .bim file. 47484 variants and 407 people pass filters and QC.

Import the data and covert it to genind format

# import the data
albo <-
  read.PLINK(
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/mantel.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)

Save

saveRDS(albo2, here(
  "output", "europe", "fst", "europeLD2.rds"
))

Load it

albo2 <- readRDS(here(
  "output", "europe", "fst", "europeLD2.rds"
))

Import Sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
head(sampling_loc)
##       Pop_City    Country  Latitude Longitude Continent Abbreviation
## 1  Franceville      Gabon  -1.59207  13.53242    Africa          GAB
## 2 Antananarivo Madagascar -18.87920  47.50790    Africa          ANT
## 3  Diego ville Madagascar -12.27361  49.29372    Africa          DGV
## 4    Morondava Madagascar -20.28420  44.27940    Africa          MAD
## 5     Vohimasy Madagascar -22.81591  47.75026    Africa          VOH
## 6      Dauguet  Mauritius -20.18530  57.52154    Africa          DAU
##          Year         Region    Subregion order
## 1        2015 Central Africa       Africa    72
## 2        2022    East Africa  East Africa    76
## 3        2022    East Africa  East Africa    77
## 4        2016    East Africa  East Africa    78
## 5 2016 & 2017    East Africa  East Africa    79
## 6        2022   Indian Ocean Indian Ocean    80

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/mantel.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# # Replace the BEN a with BEN b (remember to never name samples with the same ID... I change it manually in the fam file.)
# fam_data <- fam_data %>% 
#   mutate(IndividualID = ifelse(FamilyID == "BEN" & IndividualID == "a", "b", IndividualID))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1      SOC         1065          0          0   0        -9    Sochi  Russia
## 2      SOC         1066          0          0   0        -9    Sochi  Russia
## 3      SOC         1067          0          0   0        -9    Sochi  Russia
## 4      SOC         1068          0          0   0        -9    Sochi  Russia
## 5      SOC         1069          0          0   0        -9    Sochi  Russia
## 6      SOC         1070          0          0   0        -9    Sochi  Russia
##   Latitude Longitude Continent Year         Region   Subregion order
## 1 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 2 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 3 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 4 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 5 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 6 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##          [,1]     [,2]
## [1,] 43.60042 39.74533
## [2,] 43.60042 39.74533
## [3,] 43.60042 39.74533
## [4,] 43.60042 39.74533
## [5,] 43.60042 39.74533
## [6,] 43.60042 39.74533
colnames(albo_dist2)<- c("x","y") 

Add jitter

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 43.60204 39.74689
## [2,] 43.60159 39.74539
## [3,] 43.59905 39.74583
## [4,] 43.59910 39.74705
## [5,] 43.59873 39.74606
## [6,] 43.59864 39.74720

Add to object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "output", "europe", "fst", "europeLD2.rds"
  )
)

2.1 Isolation By Distance for Europe (Set 2)

Convert the data

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Get 1 mosquito per population, it is just to get the geographical coordinates

unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

Dgen <- dist.genpop(toto,method=2)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen,Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: -0.0106724 
## 
## Based on 999 replicates
## Simulated p-value: 0.551 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
## -0.167827661  0.004410054  0.008076388

Simulated p-value is not statistically signficant. Alternative hypothesis: greater. The test was one-sided, checking if the observed correlation is greater than what would be expected by chance.

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "simIBD_europe_LD2.pdf"))
plot(ibd)
dev.off()
plot(ibd)

plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "Genetic_v_Geog_distance_europe_LD2_1.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

dev.off()

Add the equation

# Plotting the data
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
# You can adjust the position (e.g., x and y values) as necessary
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "Genetic_v_Geog_distance_europe_LD2.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
# You can adjust the position (e.g., x and y values) as necessary
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)
dev.off()

Use library MASS for plot

library(MASS)
CairoPDF(here(
     "output", "europe", "fst", "IDB_PlotFromMASS_density_1.pdf"))
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
myPal <-
  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")

dev.off()
png(here("output", "europe", "fst", "ibd2_1.png"),
    width = 5,
    height = 4,
    units='in',
    res = 300)
myPal <-
  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")
dev.off()
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

title("Isolation by distance")

Check the populations - I did not include those with less than 4 mosquitoes

summary(albo2$pop)
## ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC IMP ITB ITP ITR KER KRA 
##  10  12  12  10  12  13  10  14  12  16  12  12  11  10   4   5   8  12  12  12 
## MAL POP RAR ROM ROS SER SEV SIC SLO SOC SPB SPC SPM SPS STS TIK TIR TRE TUA TUH 
##  12  12  12   4  11   4  12   9  12  12   8   6   5   8  12  12   4  12   9  12

3. Estimate Fst for Europe populations with at least 4 individuals using r<0.01 SNP set (Set 1)

Create list of populations

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe

awk '{print $1}' output/snps_sets/r2_0.01.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 4 {print}' | awk '{print $1}' > output/fst/pops_4fst.txt;
head  output/fst/pops_4fst.txt;
wc -l output/fst/pops_4fst.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## 40 output/fst/pops_4fst.txt

We have 40 populations with 4 or more mosquitoes.

3.1 Convert to raw format and subset the bed file

First load plink

module load PLINK/1.9b_6.21-x86_64
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01 \
--keep-fam output/fst/pops_4fst.txt \
--recodeA \
--out output/fst/r2_0.01 \
--silent;
grep 'samples\|variants\|remaining' output/fst/r2_0.01.log

17028 variants loaded from .bim file. –keep-fam: 407 people remaining. Total genotyping rate in remaining samples is 0.970158. 17028 variants and 407 people pass filters and QC.

Look at https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of Fst estimations

LD2 <-
  read.PLINK(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/r2_0.01.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )
## 
##  Reading PLINK raw format into a genlight object... 
## 
## 
##  Reading loci information... 
## 
##  Reading and converting genotypes... 
## .
##  Building final object... 
## 
## ...done.
summary(LD2)
##   Length    Class     Mode 
##        1 genlight       S4

3.2 Convert the genlight object to Stampp format, and estimate pairwide Fst values

The Europe+Asia dataset kept crashing R Studio when running the next step, so we will try running it directly in the R module in McCleary Install StAMPP for direct use in the R module in the cluster

salloc
module load R/4.2.0-foss-2020b
R
#install.packages("StAMPP", repos="http://cran.r-project.org")

srun -p ycga --pty -N 1 -n 1 -c 4 bash
#salloc -p ycga -t 2:00:00 -c 5
#srun -p ycga --pty bash #start an interactive session and load R packages needed
R
library(StAMPP)
library(adegenet)
library(here)

LD2 <-
  read.PLINK(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/r2_0.01.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

summary(LD2)

#The command below would also work, but you can simplify it and put only the numbers: genome_equal_2 <- stamppFst(neutral, nboots=100, percent=95 + nclusters==10)

#This chunk will take a couple minutes to run.

LD2_2 <- stamppConvert(LD2, type="genlight")

LD2_3 <- stamppFst(LD2_2, 1, 95, 1)

saveRDS(
  LD2_3, here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/r_01/LD2_01.rds"
  )
)

To load it

LD2_3 <- readRDS(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/r_01/LD2_01.rds"
  )
)

Now lets look at the object

summary(LD2_3)
##       SOC                SEV               GES               KER         
##  Min.   :0.007311   Min.   :0.02992   Min.   :0.01736   Min.   :0.03118  
##  1st Qu.:0.096319   1st Qu.:0.11754   1st Qu.:0.10369   1st Qu.:0.07739  
##  Median :0.111864   Median :0.13548   Median :0.12121   Median :0.09315  
##  Mean   :0.107863   Mean   :0.13222   Mean   :0.11844   Mean   :0.09748  
##  3rd Qu.:0.128105   3rd Qu.:0.15325   3rd Qu.:0.13510   3rd Qu.:0.10958  
##  Max.   :0.225734   Max.   :0.24709   Max.   :0.23191   Max.   :0.20274  
##  NA's   :1          NA's   :2         NA's   :3         NA's   :4        
##       KRA               TIK               RAR               TRE         
##  Min.   :0.01111   Min.   :0.05284   Min.   :0.05298   Min.   :0.02280  
##  1st Qu.:0.07491   1st Qu.:0.12123   1st Qu.:0.11076   1st Qu.:0.03519  
##  Median :0.09108   Median :0.13647   Median :0.12539   Median :0.05345  
##  Mean   :0.09209   Mean   :0.13920   Mean   :0.13079   Mean   :0.05612  
##  3rd Qu.:0.10552   3rd Qu.:0.15373   3rd Qu.:0.14289   3rd Qu.:0.06377  
##  Max.   :0.19543   Max.   :0.24869   Max.   :0.23378   Max.   :0.15172  
##  NA's   :5         NA's   :6         NA's   :7         NA's   :8        
##       ALU               STS               SIC               BRE         
##  Min.   :0.06427   Min.   :0.02997   Min.   :0.04444   Min.   :0.08688  
##  1st Qu.:0.07946   1st Qu.:0.05088   1st Qu.:0.06078   1st Qu.:0.12055  
##  Median :0.09406   Median :0.06915   Median :0.08174   Median :0.13740  
##  Mean   :0.10162   Mean   :0.07324   Mean   :0.08448   Mean   :0.14092  
##  3rd Qu.:0.10823   3rd Qu.:0.07785   3rd Qu.:0.09264   3rd Qu.:0.15603  
##  Max.   :0.19589   Max.   :0.17344   Max.   :0.19149   Max.   :0.24786  
##  NA's   :9         NA's   :10        NA's   :11        NA's   :12       
##       DES               CES              TIR                 IMP         
##  Min.   :0.07757   Min.   :0.1314   Min.   :-0.000645   Min.   :0.04306  
##  1st Qu.:0.10679   1st Qu.:0.1518   1st Qu.: 0.063331   1st Qu.:0.05707  
##  Median :0.12593   Median :0.1746   Median : 0.073437   Median :0.08053  
##  Mean   :0.12750   Mean   :0.1751   Mean   : 0.082908   Mean   :0.08087  
##  3rd Qu.:0.14060   3rd Qu.:0.1875   3rd Qu.: 0.100416   3rd Qu.:0.08657  
##  Max.   :0.22813   Max.   :0.2837   Max.   : 0.221505   Max.   :0.22494  
##  NA's   :13        NA's   :14       NA's   :15          NA's   :16       
##       ROM                GRC               BAR               BUL         
##  Min.   :0.005747   Min.   :0.02423   Min.   :0.04388   Min.   :0.05313  
##  1st Qu.:0.044609   1st Qu.:0.06052   1st Qu.:0.08151   1st Qu.:0.06765  
##  Median :0.065980   Median :0.07806   Median :0.09771   Median :0.08864  
##  Mean   :0.066735   Mean   :0.08185   Mean   :0.10234   Mean   :0.09014  
##  3rd Qu.:0.080996   3rd Qu.:0.09858   3rd Qu.:0.12127   3rd Qu.:0.09997  
##  Max.   :0.206208   Max.   :0.18863   Max.   :0.21578   Max.   :0.19385  
##  NA's   :17         NA's   :18        NA's   :19        NA's   :20       
##       CRO               GRA               ITB               MAL         
##  Min.   :0.02783   Min.   :0.04929   Min.   :0.05313   Min.   :0.01794  
##  1st Qu.:0.05626   1st Qu.:0.05828   1st Qu.:0.07026   1st Qu.:0.03869  
##  Median :0.06515   Median :0.07547   Median :0.08330   Median :0.05604  
##  Mean   :0.07561   Mean   :0.08066   Mean   :0.09315   Mean   :0.05880  
##  3rd Qu.:0.09137   3rd Qu.:0.09714   3rd Qu.:0.10416   3rd Qu.:0.07361  
##  Max.   :0.17206   Max.   :0.18532   Max.   :0.22078   Max.   :0.16004  
##  NA's   :21        NA's   :22        NA's   :23        NA's   :24       
##       SPM                TUA               TUH               ALD         
##  Min.   :0.006871   Min.   :0.02829   Min.   :0.03447   Min.   :0.02244  
##  1st Qu.:0.045630   1st Qu.:0.05807   1st Qu.:0.04586   1st Qu.:0.08179  
##  Median :0.056524   Median :0.07420   Median :0.06887   Median :0.09939  
##  Mean   :0.066728   Mean   :0.08066   Mean   :0.07247   Mean   :0.10639  
##  3rd Qu.:0.081846   3rd Qu.:0.09925   3rd Qu.:0.08466   3rd Qu.:0.12546  
##  Max.   :0.209145   Max.   :0.19709   Max.   :0.17094   Max.   :0.21518  
##  NA's   :25         NA's   :26        NA's   :27        NA's   :28       
##       FRS               ITP               POP               ROS         
##  Min.   :0.03904   Min.   :0.02335   Min.   :0.03484   Min.   :0.05978  
##  1st Qu.:0.04250   1st Qu.:0.04222   1st Qu.:0.05135   1st Qu.:0.08975  
##  Median :0.06191   Median :0.06744   Median :0.05485   Median :0.09692  
##  Mean   :0.06874   Mean   :0.07105   Mean   :0.07033   Mean   :0.10801  
##  3rd Qu.:0.07393   3rd Qu.:0.07974   3rd Qu.:0.07401   3rd Qu.:0.11011  
##  Max.   :0.16317   Max.   :0.17352   Max.   :0.16544   Max.   :0.20168  
##  NA's   :29        NA's   :30        NA's   :31        NA's   :32       
##       SER              SLO               SPC               SPB         
##  Min.   :0.1519   Min.   :0.04098   Min.   :0.00762   Min.   :0.05969  
##  1st Qu.:0.1729   1st Qu.:0.05351   1st Qu.:0.06937   1st Qu.:0.07454  
##  Median :0.1879   Median :0.05713   Median :0.08597   Median :0.08747  
##  Mean   :0.1915   Mean   :0.06075   Mean   :0.08068   Mean   :0.09051  
##  3rd Qu.:0.2094   3rd Qu.:0.06211   3rd Qu.:0.10543   3rd Qu.:0.10344  
##  Max.   :0.2363   Max.   :0.09319   Max.   :0.13499   Max.   :0.12741  
##  NA's   :33       NA's   :34        NA's   :35        NA's   :36       
##       SPS               ARM               ALV               ITR     
##  Min.   :0.07167   Min.   :0.06895   Min.   :0.05727   Min.   : NA  
##  1st Qu.:0.09287   1st Qu.:0.07346   1st Qu.:0.05727   1st Qu.: NA  
##  Median :0.11407   Median :0.07797   Median :0.05727   Median : NA  
##  Mean   :0.10048   Mean   :0.07797   Mean   :0.05727   Mean   :NaN  
##  3rd Qu.:0.11488   3rd Qu.:0.08248   3rd Qu.:0.05727   3rd Qu.: NA  
##  Max.   :0.11569   Max.   :0.08699   Max.   :0.05727   Max.   : NA  
##  NA's   :37        NA's   :38        NA's   :39        NA's   :40

If you want you can save the fst values as csv.

# Convert to data frame
LD2_df <- data.frame(LD2_3)
# Save it
write.csv(LD2_df, file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/r_01/LD2_df_01.csv"))

Check the Fst values

head(LD2_df)
##             SOC        SEV        GES        KER        KRA TIK RAR TRE ALU STS
## SOC          NA         NA         NA         NA         NA  NA  NA  NA  NA  NA
## SEV 0.045340580         NA         NA         NA         NA  NA  NA  NA  NA  NA
## GES 0.011051225 0.05398193         NA         NA         NA  NA  NA  NA  NA  NA
## KER 0.046624742 0.06797261 0.05431325         NA         NA  NA  NA  NA  NA  NA
## KRA 0.007310457 0.02992473 0.01736366 0.03118331         NA  NA  NA  NA  NA  NA
## TIK 0.058308330 0.07951033 0.05326679 0.07204451 0.03550992  NA  NA  NA  NA  NA
##     SIC BRE DES CES TIR IMP ROM GRC BAR BUL CRO GRA ITB MAL SPM TUA TUH ALD FRS
## SOC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## SEV  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GES  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KER  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KRA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TIK  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
##     ITP POP ROS SER SLO SPC SPB SPS ARM ALV ITR
## SOC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## SEV  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GES  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KER  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KRA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TIK  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

Convert the data into a matrix.

aa <- as.matrix(LD2_df)
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##             SOC        SEV        GES        KER         KRA        TIK
## SOC          NA 0.04534058 0.01105123 0.04662474 0.007310457 0.05830833
## SEV 0.045340580         NA 0.05398193 0.06797261 0.029924734 0.07951033
## GES 0.011051225 0.05398193         NA 0.05431325 0.017363660 0.05326679
## KER 0.046624742 0.06797261 0.05431325         NA 0.031183311 0.07204451
## KRA 0.007310457 0.02992473 0.01736366 0.03118331          NA 0.03550992
## TIK 0.058308330 0.07951033 0.05326679 0.07204451 0.035509919         NA
##            RAR        TRE        ALU        STS        SIC       BRE       DES
## SOC 0.02140308 0.09774556 0.06956851 0.10725685 0.11842581 0.1778315 0.1664752
## SEV 0.05274907 0.11676360 0.09533467 0.12805521 0.14109059 0.1957159 0.1853356
## GES 0.02845986 0.10228847 0.08536454 0.11152269 0.12403531 0.1804918 0.1701758
## KER 0.05335175 0.07223555 0.06478502 0.08210551 0.09791612 0.1522506 0.1390027
## KRA 0.01110483 0.07237471 0.05408914 0.08351473 0.09244042 0.1509169 0.1405311
## TIK 0.05283903 0.11486839 0.09035828 0.12571084 0.13863956 0.1938497 0.1815638
##           CES       TIR        IMP        ROM        GRC       BAR       BUL
## SOC 0.2084034 0.1279906 0.12014266 0.10598720 0.11737403 0.1362259 0.1282197
## SEV 0.2266013 0.1533911 0.14660265 0.13150060 0.13483143 0.1584720 0.1494195
## GES 0.2108559 0.1350974 0.12700509 0.11104200 0.12349279 0.1437363 0.1344213
## KER 0.1801340 0.1099683 0.09105621 0.08616772 0.10043376 0.1237754 0.1084714
## KRA 0.1811980 0.1019753 0.09219167 0.08079209 0.09064014 0.1118393 0.1028093
## TIK 0.2231552 0.1547583 0.14573453 0.13191149 0.13556816 0.1551659 0.1486248
##            CRO        GRA       ITB        MAL        SPM        TUA        TUH
## SOC 0.11548159 0.10812974 0.1344973 0.09509352 0.11186385 0.11843156 0.10440110
## SEV 0.13612178 0.12816215 0.1586964 0.11731416 0.13942614 0.13961729 0.12321527
## GES 0.12130010 0.11394608 0.1405141 0.10187843 0.12078634 0.12484530 0.11211883
## KER 0.09946183 0.09385646 0.1094535 0.07532346 0.08876307 0.10298389 0.08472508
## KRA 0.09226819 0.08358153 0.1082221 0.07264223 0.08298896 0.09156695 0.07825535
## TIK 0.13369582 0.12732836 0.1581451 0.11588489 0.13491594 0.13768571 0.12231274
##           ALD        FRS        ITP        POP       ROS       SER        SLO
## SOC 0.1400475 0.10249166 0.10147814 0.09916795 0.1343887 0.2257335 0.09200842
## SEV 0.1604408 0.12125322 0.12308452 0.12072078 0.1550944 0.2470881 0.11079811
## GES 0.1455015 0.10774189 0.10719453 0.10568851 0.1398834 0.2319092 0.09786065
## KER 0.1249852 0.07807626 0.08041833 0.07279555 0.1105760 0.2027373 0.07278478
## KRA 0.1144149 0.07603818 0.07526203 0.07455165 0.1082726 0.1954302 0.06545358
## TIK 0.1606478 0.12086388 0.12263965 0.11939275 0.1506513 0.2486889 0.10862989
##            SPC        SPB       SPS        ARM        ALV        ITR
## SOC 0.12467311 0.11528409 0.1505581 0.04473196 0.11895717 0.09754439
## SEV 0.15283101 0.14107609 0.1730141 0.07087941 0.14016159 0.11822902
## GES 0.13281801 0.12120594 0.1588711 0.05626477 0.12520599 0.10368592
## KER 0.10120180 0.09243961 0.1323221 0.03946515 0.10268353 0.07931754
## KRA 0.09829134 0.09108052 0.1251237 0.01848469 0.09600472 0.07323132
## TIK 0.14678514 0.13736995 0.1708804 0.06769401 0.13827773 0.11770930

Import sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))


# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    order
  )

# Check it
head(sampling_loc)
##               Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1 Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2           Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3             Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4                Loule Portugal 37.09084 -8.092465    Europe          POL 2017
## 5              Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 6            San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
##            Region   Subregion order
## 1  Western Europe West Europe     1
## 2  Western Europe West Europe     2
## 3 Southern Europe West Europe     3
## 4 Southern Europe West Europe     4
## 5 Southern Europe West Europe     5
## 6 Southern Europe West Europe     6

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##   [1] "FRS" "STS" "POP" "POL" "SPB" "SPS" "SPC" "BAR" "SPM" "IMP" "ITG" "BRE"
##  [13] "DES" "TRE" "ITB" "CES" "ROM" "ITR" "SIC" "ITP" "MAL" "SLO" "CRO" "ALV"
##  [25] "ALD" "TIR" "SER" "GRA" "GRC" "ROS" "BUL" "TUA" "TUH" "SEV" "ALU" "KER"
##  [37] "KRA" "SOC" "TIK" "RAR" "GES" "ARM" "KAN" "UTS" "KAG" "OKI" "HAI" "YUN"
##  [49] "HUN" "TAI" "GEL" "BEN" "KUN" "KAT" "JAF" "CAM" "SUF" "SUU" "INW" "INJ"
##  [61] "KLP" "MAT" "SSK" "KAC" "SON" "CHA" "LAM" "HAN" "HOC" "QNC" "RAB" "GAB"
##  [73] "LIB" "YAO" "AWK" "ANT" "DGV" "MAD" "VOH" "DAU" "TRO" "BRM" "JAM" "SAI"
##  [85] "BEA" "BER" "CHI" "COL" "DAL" "FAY" "HOU" "LOS" "MAC" "MAN" "NEO" "NEW"
##  [97] "NUE" "PAL" "PEO" "RUS" "SPR" "POR" "GRV" "MAU" "NOV" "REC" "TUC" "MED"
## [109] "AIZ" "HIR" "KHO" "KYO" "NAG" "NIG" "SAG" "SAK" "SEN" "TAN" "JAT" "YAT"

Create vector with order of populations

# Extract the populations that appear in LD2_df
populations_in_LD2 <- colnames(LD2_df)

# Reorder the populations based on order_pops
poporder <- populations_in_LD2[populations_in_LD2 %in% order_pops]

#LD2_df[match(poporder, LD2_df$Abbreviation),] #this also doesn't reorder it

# Print the reordered populations
print(poporder)
##  [1] "SOC" "SEV" "GES" "KER" "KRA" "TIK" "RAR" "TRE" "ALU" "STS" "SIC" "BRE"
## [13] "DES" "CES" "TIR" "IMP" "ROM" "GRC" "BAR" "BUL" "CRO" "GRA" "ITB" "MAL"
## [25] "SPM" "TUA" "TUH" "ALD" "FRS" "ITP" "POP" "ROS" "SER" "SLO" "SPC" "SPB"
## [37] "SPS" "ARM" "ALV" "ITR"

Lets check if the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Order the matrix using poporder. We will also add NA on the upper left side of the matrix.

aa <- aa[poporder, poporder]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1           Var2          value        
##  SOC    :  40   SOC    :  40   Min.   :-0.0006  
##  SEV    :  40   SEV    :  40   1st Qu.: 0.0660  
##  GES    :  40   GES    :  40   Median : 0.0936  
##  KER    :  40   KER    :  40   Mean   : 0.0999  
##  KRA    :  40   KRA    :  40   3rd Qu.: 0.1258  
##  TIK    :  40   TIK    :  40   Max.   : 0.2837  
##  (Other):1360   (Other):1360   NA's   :820

Now lets plot the data with ggplot. You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 2) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

Save it

ggsave(
  filename = here("output", "europe", "fst", "r_01", "fst_matrix_europe_r01_LD2.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)

3.3 Fst by country for European Set 1 dataset

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(LD2_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}

# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))

# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##             Albania    Armenia   Bulgaria    Croatia     France    Georgia
## Albania          NA 0.09321744 0.10071586 0.02902128 0.08171049 0.13526830
## Armenia  0.09321744         NA 0.09453274 0.08402086 0.06488566 0.05626477
## Bulgaria 0.10071586 0.09453274         NA 0.08659803 0.07492555 0.13442128
## Croatia  0.02902128 0.08402086 0.08659803         NA 0.06325525 0.12130010
## France   0.08171049 0.06488566 0.07492555 0.06325525         NA 0.10963229
## Georgia  0.13526830 0.05626477 0.13442128 0.12130010 0.10963229         NA
## Greece   0.06256676 0.08098370 0.09271700 0.05530008 0.07048226 0.11871943
## Italy    0.10031787 0.08125198 0.09058315 0.08446013 0.07069720 0.13772888
## Malta    0.06943936 0.06208794 0.05647669 0.05515396 0.03900198 0.10187843
## Portugal 0.07568869 0.05432709 0.06598036 0.06468746 0.04262535 0.10568851
## Romania  0.11177139 0.09437558 0.10276302 0.10008786 0.07513883 0.13988343
## Russia   0.13048892 0.04597386 0.12935100 0.11613403 0.10534007 0.02204067
## Serbia   0.20037488 0.18791125 0.19385358 0.17206143 0.16830584 0.23190916
## Slovenia 0.06605789 0.05327576 0.06528185 0.05737246 0.04394329 0.09786065
## Spain    0.10936323 0.08179381 0.10542292 0.09378821 0.07178449 0.13548354
## Turkey   0.07977018 0.07764824 0.07567729 0.07033617 0.06034639 0.11848206
## Ukraine  0.12284139 0.05820559 0.12083319 0.10949639 0.09577193 0.06191042
##              Greece      Italy      Malta   Portugal    Romania     Russia
## Albania  0.06256676 0.10031787 0.06943936 0.07568869 0.11177139 0.13048892
## Armenia  0.08098370 0.08125198 0.06208794 0.05432709 0.09437558 0.04597386
## Bulgaria 0.09271700 0.09058315 0.05647669 0.06598036 0.10276302 0.12935100
## Croatia  0.05530008 0.08446013 0.05515396 0.06468746 0.10008786 0.11613403
## France   0.07048226 0.07069720 0.03900198 0.04262535 0.07513883 0.10534007
## Georgia  0.11871943 0.13772888 0.10187843 0.10568851 0.13988343 0.02204067
## Greece           NA 0.08950131 0.05519872 0.06808207 0.10204372 0.11265341
## Italy    0.08950131         NA 0.04627918 0.04934449 0.09222338 0.13368546
## Malta    0.05519872 0.04627918         NA 0.02989389 0.07421546 0.09726957
## Portugal 0.06808207 0.04934449 0.02989389         NA 0.07400939 0.10088585
## Romania  0.10204372 0.09222338 0.07421546 0.07400939         NA 0.13405182
## Russia   0.11265341 0.13368546 0.09726957 0.10088585 0.13405182         NA
## Serbia   0.18697367 0.18879959 0.16004224 0.16544015 0.20168208 0.22590842
## Slovenia 0.05673457 0.05219771 0.03919840 0.03612759 0.05977806 0.09144282
## Spain    0.09397200 0.09029722 0.07212531 0.06579523 0.10718238 0.12934461
## Turkey   0.07171085 0.07023434 0.04526383 0.05569789 0.09184709 0.11147059
## Ukraine  0.10485926 0.09974544 0.08771344 0.09057183 0.12625042 0.05832179
##             Serbia   Slovenia      Spain     Turkey    Ukraine
## Albania  0.2003749 0.06605789 0.10936323 0.07977018 0.12284139
## Armenia  0.1879113 0.05327576 0.08179381 0.07764824 0.05820559
## Bulgaria 0.1938536 0.06528185 0.10542292 0.07567729 0.12083319
## Croatia  0.1720614 0.05737246 0.09378821 0.07033617 0.10949639
## France   0.1683058 0.04394329 0.07178449 0.06034639 0.09577193
## Georgia  0.2319092 0.09786065 0.13548354 0.11848206 0.06191042
## Greece   0.1869737 0.05673457 0.09397200 0.07171085 0.10485926
## Italy    0.1887996 0.05219771 0.09029722 0.07023434 0.09974544
## Malta    0.1600422 0.03919840 0.07212531 0.04526383 0.08771344
## Portugal 0.1654401 0.03612759 0.06579523 0.05569789 0.09057183
## Romania  0.2016821 0.05977806 0.10718238 0.09184709 0.12625042
## Russia   0.2259084 0.09144282 0.12934461 0.11147059 0.05832179
## Serbia          NA 0.15188779 0.21540658 0.18401508 0.21523980
## Slovenia 0.1518878         NA 0.06515292 0.05208143 0.08333507
## Spain    0.2154066 0.06515292         NA 0.07099299 0.12195514
## Turkey   0.1840151 0.05208143 0.07099299         NA 0.10371389
## Ukraine  0.2152398 0.08333507 0.12195514 0.10371389         NA
symmetric_fst_matrix[lower.tri(symmetric_fst_matrix)] <- NA
print(symmetric_fst_matrix)
##          Albania    Armenia   Bulgaria    Croatia     France    Georgia
## Albania       NA 0.09321744 0.10071586 0.02902128 0.08171049 0.13526830
## Armenia       NA         NA 0.09453274 0.08402086 0.06488566 0.05626477
## Bulgaria      NA         NA         NA 0.08659803 0.07492555 0.13442128
## Croatia       NA         NA         NA         NA 0.06325525 0.12130010
## France        NA         NA         NA         NA         NA 0.10963229
## Georgia       NA         NA         NA         NA         NA         NA
## Greece        NA         NA         NA         NA         NA         NA
## Italy         NA         NA         NA         NA         NA         NA
## Malta         NA         NA         NA         NA         NA         NA
## Portugal      NA         NA         NA         NA         NA         NA
## Romania       NA         NA         NA         NA         NA         NA
## Russia        NA         NA         NA         NA         NA         NA
## Serbia        NA         NA         NA         NA         NA         NA
## Slovenia      NA         NA         NA         NA         NA         NA
## Spain         NA         NA         NA         NA         NA         NA
## Turkey        NA         NA         NA         NA         NA         NA
## Ukraine       NA         NA         NA         NA         NA         NA
##              Greece      Italy      Malta   Portugal    Romania     Russia
## Albania  0.06256676 0.10031787 0.06943936 0.07568869 0.11177139 0.13048892
## Armenia  0.08098370 0.08125198 0.06208794 0.05432709 0.09437558 0.04597386
## Bulgaria 0.09271700 0.09058315 0.05647669 0.06598036 0.10276302 0.12935100
## Croatia  0.05530008 0.08446013 0.05515396 0.06468746 0.10008786 0.11613403
## France   0.07048226 0.07069720 0.03900198 0.04262535 0.07513883 0.10534007
## Georgia  0.11871943 0.13772888 0.10187843 0.10568851 0.13988343 0.02204067
## Greece           NA 0.08950131 0.05519872 0.06808207 0.10204372 0.11265341
## Italy            NA         NA 0.04627918 0.04934449 0.09222338 0.13368546
## Malta            NA         NA         NA 0.02989389 0.07421546 0.09726957
## Portugal         NA         NA         NA         NA 0.07400939 0.10088585
## Romania          NA         NA         NA         NA         NA 0.13405182
## Russia           NA         NA         NA         NA         NA         NA
## Serbia           NA         NA         NA         NA         NA         NA
## Slovenia         NA         NA         NA         NA         NA         NA
## Spain            NA         NA         NA         NA         NA         NA
## Turkey           NA         NA         NA         NA         NA         NA
## Ukraine          NA         NA         NA         NA         NA         NA
##             Serbia   Slovenia      Spain     Turkey    Ukraine
## Albania  0.2003749 0.06605789 0.10936323 0.07977018 0.12284139
## Armenia  0.1879113 0.05327576 0.08179381 0.07764824 0.05820559
## Bulgaria 0.1938536 0.06528185 0.10542292 0.07567729 0.12083319
## Croatia  0.1720614 0.05737246 0.09378821 0.07033617 0.10949639
## France   0.1683058 0.04394329 0.07178449 0.06034639 0.09577193
## Georgia  0.2319092 0.09786065 0.13548354 0.11848206 0.06191042
## Greece   0.1869737 0.05673457 0.09397200 0.07171085 0.10485926
## Italy    0.1887996 0.05219771 0.09029722 0.07023434 0.09974544
## Malta    0.1600422 0.03919840 0.07212531 0.04526383 0.08771344
## Portugal 0.1654401 0.03612759 0.06579523 0.05569789 0.09057183
## Romania  0.2016821 0.05977806 0.10718238 0.09184709 0.12625042
## Russia   0.2259084 0.09144282 0.12934461 0.11147059 0.05832179
## Serbia          NA 0.15188779 0.21540658 0.18401508 0.21523980
## Slovenia        NA         NA 0.06515292 0.05208143 0.08333507
## Spain           NA         NA         NA 0.07099299 0.12195514
## Turkey          NA         NA         NA         NA 0.10371389
## Ukraine         NA         NA         NA         NA         NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix)
summary(pairfst.long2)
##        Var1           Var2         value        
##  Albania : 17   Albania : 17   Min.   :0.02204  
##  Armenia : 17   Armenia : 17   1st Qu.:0.06509  
##  Bulgaria: 17   Bulgaria: 17   Median :0.09043  
##  Croatia : 17   Croatia : 17   Mean   :0.09624  
##  France  : 17   France  : 17   3rd Qu.:0.11352  
##  Georgia : 17   Georgia : 17   Max.   :0.23191  
##  (Other) :187   (Other) :187   NA's   :153

You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 2) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("output", "europe", "fst", "r_01", "fst_matrix_europe_r01_LD2_by_country.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 40 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 SOC     0.108 
##  2 SEV     0.130 
##  3 GES     0.114 
##  4 KER     0.0943
##  5 KRA     0.0848
##  6 TIK     0.129 
##  7 RAR     0.116 
##  8 TRE     0.0636
##  9 ALU     0.0964
## 10 STS     0.0787
## # ℹ 30 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Region
  )

# check output
head(fst3)
##               Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1 Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2           Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3             Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4              Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5            San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6            Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
##     Subregion order   mean_fst
## 1 West Europe     1 0.07499194
## 2 West Europe     2 0.07874931
## 3 West Europe     3 0.07119761
## 4 West Europe     5 0.09003042
## 5 West Europe     6 0.12147228
## 6 West Europe     7 0.09952012

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Subregion) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Subregion")

# Print the modified fst3 tibble
print(fst3)
##                Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##      Subregion order   mean_fst mean_fst_by_region
## 1  West Europe     1 0.07499194                0.1
## 2  West Europe     2 0.07874931                0.1
## 3  West Europe     3 0.07119761                0.1
## 4  West Europe     5 0.09003042                0.1
## 5  West Europe     6 0.12147228                0.1
## 6  West Europe     7 0.09952012                0.1
## 7  West Europe     8 0.11429940                0.1
## 8  West Europe     9 0.08075756                0.1
## 9  West Europe    10 0.09191574                0.1
## 10 West Europe    12 0.14543917                0.1
## 11 West Europe    13 0.13083838                0.1
## 12 West Europe    14 0.06355925                0.1
## 13 West Europe    15 0.10359028                0.1
## 14 West Europe    16 0.17703688                0.1
## 15 West Europe    17 0.07997899                0.1
## 16 West Europe    18 0.07079655                0.1
## 17 West Europe    19 0.08944254                0.1
## 18 West Europe    20 0.07208743                0.1
## 19 West Europe    21 0.06876414                0.1
## 20 East Europe    22 0.06740086                0.1
## 21 East Europe    23 0.08722078                0.1
## 22 East Europe    24 0.09096560                0.1
## 23 East Europe    25 0.11256707                0.1
## 24 East Europe    26 0.09903932                0.1
## 25 East Europe    27 0.20214043                0.1
## 26 East Europe    28 0.08828169                0.1
## 27 East Europe    29 0.09295555                0.1
## 28 East Europe    30 0.10766099                0.1
## 29 East Europe    31 0.10181071                0.1
## 30 East Europe    32 0.09222311                0.1
## 31 East Europe    33 0.07798655                0.1
## 32 East Europe    34 0.12999533                0.1
## 33 East Europe    35 0.09637019                0.1
## 34 East Europe    36 0.09431187                0.1
## 35 East Europe    37 0.08484293                0.1
## 36 East Europe    38 0.10786279                0.1
## 37 East Europe    39 0.12901506                0.1
## 38 East Europe    40 0.11630266                0.1
## 39 East Europe    41 0.11403042                0.1
## 40 East Europe    42 0.07943892                0.1

Mean By country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
##                Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##      Subregion order   mean_fst mean_fst_by_region mean_fst_by_country
## 1  West Europe     1 0.07499194                0.1                0.08
## 2  West Europe     2 0.07874931                0.1                0.08
## 3  West Europe     3 0.07119761                0.1                0.07
## 4  West Europe     5 0.09003042                0.1                0.10
## 5  West Europe     6 0.12147228                0.1                0.10
## 6  West Europe     7 0.09952012                0.1                0.10
## 7  West Europe     8 0.11429940                0.1                0.10
## 8  West Europe     9 0.08075756                0.1                0.10
## 9  West Europe    10 0.09191574                0.1                0.10
## 10 West Europe    12 0.14543917                0.1                0.10
## 11 West Europe    13 0.13083838                0.1                0.10
## 12 West Europe    14 0.06355925                0.1                0.10
## 13 West Europe    15 0.10359028                0.1                0.10
## 14 West Europe    16 0.17703688                0.1                0.10
## 15 West Europe    17 0.07997899                0.1                0.10
## 16 West Europe    18 0.07079655                0.1                0.10
## 17 West Europe    19 0.08944254                0.1                0.10
## 18 West Europe    20 0.07208743                0.1                0.10
## 19 West Europe    21 0.06876414                0.1                0.07
## 20 East Europe    22 0.06740086                0.1                0.07
## 21 East Europe    23 0.08722078                0.1                0.09
## 22 East Europe    24 0.09096560                0.1                0.10
## 23 East Europe    25 0.11256707                0.1                0.10
## 24 East Europe    26 0.09903932                0.1                0.10
## 25 East Europe    27 0.20214043                0.1                0.20
## 26 East Europe    28 0.08828169                0.1                0.09
## 27 East Europe    29 0.09295555                0.1                0.09
## 28 East Europe    30 0.10766099                0.1                0.11
## 29 East Europe    31 0.10181071                0.1                0.10
## 30 East Europe    32 0.09222311                0.1                0.09
## 31 East Europe    33 0.07798655                0.1                0.09
## 32 East Europe    34 0.12999533                0.1                0.11
## 33 East Europe    35 0.09637019                0.1                0.11
## 34 East Europe    36 0.09431187                0.1                0.11
## 35 East Europe    37 0.08484293                0.1                0.11
## 36 East Europe    38 0.10786279                0.1                0.11
## 37 East Europe    39 0.12901506                0.1                0.11
## 38 East Europe    40 0.11630266                0.1                0.11
## 39 East Europe    41 0.11403042                0.1                0.11
## 40 East Europe    42 0.07943892                0.1                0.08

Mean by latitude

# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
  mutate(Latitude_group = ifelse(Latitude >= 40, "Above 40N", "Below 40N"))

# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
  group_by(Latitude_group) |>
  summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
  left_join(summary_by_latitude, by = "Latitude_group")


# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City)

# Print the modified fst3 tibble
print(fst3)
##                    City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##      Subregion order   mean_fst mean_fst_by_region mean_fst_by_country
## 1  West Europe     1 0.07499194                0.1                0.08
## 2  West Europe     2 0.07874931                0.1                0.08
## 3  West Europe     3 0.07119761                0.1                0.07
## 4  West Europe     5 0.09003042                0.1                0.10
## 5  West Europe     6 0.12147228                0.1                0.10
## 6  West Europe     7 0.09952012                0.1                0.10
## 7  West Europe     8 0.11429940                0.1                0.10
## 8  West Europe     9 0.08075756                0.1                0.10
## 9  West Europe    10 0.09191574                0.1                0.10
## 10 West Europe    12 0.14543917                0.1                0.10
## 11 West Europe    13 0.13083838                0.1                0.10
## 12 West Europe    14 0.06355925                0.1                0.10
## 13 West Europe    15 0.10359028                0.1                0.10
## 14 West Europe    16 0.17703688                0.1                0.10
## 15 West Europe    17 0.07997899                0.1                0.10
## 16 West Europe    18 0.07079655                0.1                0.10
## 17 West Europe    19 0.08944254                0.1                0.10
## 18 West Europe    20 0.07208743                0.1                0.10
## 19 West Europe    21 0.06876414                0.1                0.07
## 20 East Europe    22 0.06740086                0.1                0.07
## 21 East Europe    23 0.08722078                0.1                0.09
## 22 East Europe    24 0.09096560                0.1                0.10
## 23 East Europe    25 0.11256707                0.1                0.10
## 24 East Europe    26 0.09903932                0.1                0.10
## 25 East Europe    27 0.20214043                0.1                0.20
## 26 East Europe    28 0.08828169                0.1                0.09
## 27 East Europe    29 0.09295555                0.1                0.09
## 28 East Europe    30 0.10766099                0.1                0.11
## 29 East Europe    31 0.10181071                0.1                0.10
## 30 East Europe    32 0.09222311                0.1                0.09
## 31 East Europe    33 0.07798655                0.1                0.09
## 32 East Europe    34 0.12999533                0.1                0.11
## 33 East Europe    35 0.09637019                0.1                0.11
## 34 East Europe    36 0.09431187                0.1                0.11
## 35 East Europe    37 0.08484293                0.1                0.11
## 36 East Europe    38 0.10786279                0.1                0.11
## 37 East Europe    39 0.12901506                0.1                0.11
## 38 East Europe    40 0.11630266                0.1                0.11
## 39 East Europe    41 0.11403042                0.1                0.11
## 40 East Europe    42 0.07943892                0.1                0.08
##    Latitude_group mean_fst_by_latitude
## 1       Above 40N           0.10236914
## 2       Above 40N           0.10236914
## 3       Above 40N           0.10236914
## 4       Below 40N           0.09149416
## 5       Below 40N           0.09149416
## 6       Below 40N           0.09149416
## 7       Above 40N           0.10236914
## 8       Below 40N           0.09149416
## 9       Above 40N           0.10236914
## 10      Above 40N           0.10236914
## 11      Above 40N           0.10236914
## 12      Above 40N           0.10236914
## 13      Above 40N           0.10236914
## 14      Above 40N           0.10236914
## 15      Above 40N           0.10236914
## 16      Above 40N           0.10236914
## 17      Below 40N           0.09149416
## 18      Above 40N           0.10236914
## 19      Below 40N           0.09149416
## 20      Above 40N           0.10236914
## 21      Above 40N           0.10236914
## 22      Above 40N           0.10236914
## 23      Above 40N           0.10236914
## 24      Above 40N           0.10236914
## 25      Above 40N           0.10236914
## 26      Below 40N           0.09149416
## 27      Below 40N           0.09149416
## 28      Above 40N           0.10236914
## 29      Above 40N           0.10236914
## 30      Below 40N           0.09149416
## 31      Above 40N           0.10236914
## 32      Above 40N           0.10236914
## 33      Above 40N           0.10236914
## 34      Above 40N           0.10236914
## 35      Above 40N           0.10236914
## 36      Above 40N           0.10236914
## 37      Above 40N           0.10236914
## 38      Above 40N           0.10236914
## 39      Above 40N           0.10236914
## 40      Above 40N           0.10236914

Mean By continent

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Continent) |>
  summarize(mean_fst_by_continent = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Continent")

# Print the modified fst3 tibble
print(fst3)
##                    City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##      Subregion order   mean_fst mean_fst_by_region mean_fst_by_country
## 1  West Europe     1 0.07499194                0.1                0.08
## 2  West Europe     2 0.07874931                0.1                0.08
## 3  West Europe     3 0.07119761                0.1                0.07
## 4  West Europe     5 0.09003042                0.1                0.10
## 5  West Europe     6 0.12147228                0.1                0.10
## 6  West Europe     7 0.09952012                0.1                0.10
## 7  West Europe     8 0.11429940                0.1                0.10
## 8  West Europe     9 0.08075756                0.1                0.10
## 9  West Europe    10 0.09191574                0.1                0.10
## 10 West Europe    12 0.14543917                0.1                0.10
## 11 West Europe    13 0.13083838                0.1                0.10
## 12 West Europe    14 0.06355925                0.1                0.10
## 13 West Europe    15 0.10359028                0.1                0.10
## 14 West Europe    16 0.17703688                0.1                0.10
## 15 West Europe    17 0.07997899                0.1                0.10
## 16 West Europe    18 0.07079655                0.1                0.10
## 17 West Europe    19 0.08944254                0.1                0.10
## 18 West Europe    20 0.07208743                0.1                0.10
## 19 West Europe    21 0.06876414                0.1                0.07
## 20 East Europe    22 0.06740086                0.1                0.07
## 21 East Europe    23 0.08722078                0.1                0.09
## 22 East Europe    24 0.09096560                0.1                0.10
## 23 East Europe    25 0.11256707                0.1                0.10
## 24 East Europe    26 0.09903932                0.1                0.10
## 25 East Europe    27 0.20214043                0.1                0.20
## 26 East Europe    28 0.08828169                0.1                0.09
## 27 East Europe    29 0.09295555                0.1                0.09
## 28 East Europe    30 0.10766099                0.1                0.11
## 29 East Europe    31 0.10181071                0.1                0.10
## 30 East Europe    32 0.09222311                0.1                0.09
## 31 East Europe    33 0.07798655                0.1                0.09
## 32 East Europe    34 0.12999533                0.1                0.11
## 33 East Europe    35 0.09637019                0.1                0.11
## 34 East Europe    36 0.09431187                0.1                0.11
## 35 East Europe    37 0.08484293                0.1                0.11
## 36 East Europe    38 0.10786279                0.1                0.11
## 37 East Europe    39 0.12901506                0.1                0.11
## 38 East Europe    40 0.11630266                0.1                0.11
## 39 East Europe    41 0.11403042                0.1                0.11
## 40 East Europe    42 0.07943892                0.1                0.08
##    Latitude_group mean_fst_by_latitude mean_fst_by_continent
## 1       Above 40N           0.10236914                   0.1
## 2       Above 40N           0.10236914                   0.1
## 3       Above 40N           0.10236914                   0.1
## 4       Below 40N           0.09149416                   0.1
## 5       Below 40N           0.09149416                   0.1
## 6       Below 40N           0.09149416                   0.1
## 7       Above 40N           0.10236914                   0.1
## 8       Below 40N           0.09149416                   0.1
## 9       Above 40N           0.10236914                   0.1
## 10      Above 40N           0.10236914                   0.1
## 11      Above 40N           0.10236914                   0.1
## 12      Above 40N           0.10236914                   0.1
## 13      Above 40N           0.10236914                   0.1
## 14      Above 40N           0.10236914                   0.1
## 15      Above 40N           0.10236914                   0.1
## 16      Above 40N           0.10236914                   0.1
## 17      Below 40N           0.09149416                   0.1
## 18      Above 40N           0.10236914                   0.1
## 19      Below 40N           0.09149416                   0.1
## 20      Above 40N           0.10236914                   0.1
## 21      Above 40N           0.10236914                   0.1
## 22      Above 40N           0.10236914                   0.1
## 23      Above 40N           0.10236914                   0.1
## 24      Above 40N           0.10236914                   0.1
## 25      Above 40N           0.10236914                   0.1
## 26      Below 40N           0.09149416                   0.1
## 27      Below 40N           0.09149416                   0.1
## 28      Above 40N           0.10236914                   0.1
## 29      Above 40N           0.10236914                   0.1
## 30      Below 40N           0.09149416                   0.1
## 31      Above 40N           0.10236914                   0.1
## 32      Above 40N           0.10236914                   0.1
## 33      Above 40N           0.10236914                   0.1
## 34      Above 40N           0.10236914                   0.1
## 35      Above 40N           0.10236914                   0.1
## 36      Above 40N           0.10236914                   0.1
## 37      Above 40N           0.10236914                   0.1
## 38      Above 40N           0.10236914                   0.1
## 39      Above 40N           0.10236914                   0.1
## 40      Above 40N           0.10236914                   0.1
fst4 <- fst3 |>
  dplyr::select(
    Latitude_group, mean_fst_by_latitude, Subregion, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Latitude_group, Subregion, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
##   Latitude_group mean_fst_by_latitude   Subregion mean_fst_by_region  Country
## 1      Above 40N                  0.1 East Europe                0.1  Albania
## 2      Above 40N                  0.1 East Europe                0.1  Albania
## 3      Above 40N                  0.1 East Europe                0.1  Albania
## 4      Above 40N                  0.1 East Europe                0.1  Armenia
## 5      Above 40N                  0.1 East Europe                0.1 Bulgaria
## 6      Above 40N                  0.1 East Europe                0.1  Croatia
##   mean_fst_by_country      City Abbreviation mean_fst
## 1                0.10    Durres          ALD     0.11
## 2                0.10    Tirana          TIR     0.10
## 3                0.10     Vlore          ALV     0.09
## 4                0.08    Ijevan          ARM     0.08
## 5                0.10       Lom          BUL     0.10
## 6                0.09 Dubrovnik          CRO     0.09
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values for Europe using LD2 SNPs.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values for Europe using LD2 SNPs.

Latitude_group

mean_fst_by_latitude

Subregion

mean_fst_by_region

Country

mean_fst_by_country

City

Abbreviation

mean_fst

Above 40N

0.10

East Europe

0.1

Albania

0.10

Durres

ALD

0.11

Above 40N

0.10

East Europe

0.1

Albania

0.10

Tirana

TIR

0.10

Above 40N

0.10

East Europe

0.1

Albania

0.10

Vlore

ALV

0.09

Above 40N

0.10

East Europe

0.1

Armenia

0.08

Ijevan

ARM

0.08

Above 40N

0.10

East Europe

0.1

Bulgaria

0.10

Lom

BUL

0.10

Above 40N

0.10

East Europe

0.1

Croatia

0.09

Dubrovnik

CRO

0.09

Above 40N

0.10

East Europe

0.1

Georgia

0.11

Sakhumi, Abkhazia

GES

0.11

Above 40N

0.10

East Europe

0.1

Romania

0.11

Satu Mare

ROS

0.11

Above 40N

0.10

East Europe

0.1

Russia

0.11

Armavir

RAR

0.12

Above 40N

0.10

East Europe

0.1

Russia

0.11

Krasnodar

KRA

0.08

Above 40N

0.10

East Europe

0.1

Russia

0.11

Sochi

SOC

0.11

Above 40N

0.10

East Europe

0.1

Russia

0.11

Tikhoretsk

TIK

0.13

Above 40N

0.10

East Europe

0.1

Serbia

0.20

Novi Sad

SER

0.20

Above 40N

0.10

East Europe

0.1

Slovenia

0.07

Ajdovscina

SLO

0.07

Above 40N

0.10

East Europe

0.1

Turkey

0.09

Hopa

TUH

0.08

Above 40N

0.10

East Europe

0.1

Ukraine

0.11

Alushta

ALU

0.10

Above 40N

0.10

East Europe

0.1

Ukraine

0.11

Kerch, Crimea

KER

0.09

Above 40N

0.10

East Europe

0.1

Ukraine

0.11

Sevastopol, Crimea

SEV

0.13

Above 40N

0.10

West Europe

0.1

France

0.08

Saint-Martin-d'Heres

FRS

0.07

Above 40N

0.10

West Europe

0.1

France

0.08

Strasbourg

STS

0.08

Above 40N

0.10

West Europe

0.1

Italy

0.10

Bologna

ITB

0.10

Above 40N

0.10

West Europe

0.1

Italy

0.10

Brescia

BRE

0.15

Above 40N

0.10

West Europe

0.1

Italy

0.10

Cesena

CES

0.18

Above 40N

0.10

West Europe

0.1

Italy

0.10

Desenzano

DES

0.13

Above 40N

0.10

West Europe

0.1

Italy

0.10

Imperia

IMP

0.09

Above 40N

0.10

West Europe

0.1

Italy

0.10

Puglia

ITP

0.07

Above 40N

0.10

West Europe

0.1

Italy

0.10

Rome (Sapienza)

ROM

0.08

Above 40N

0.10

West Europe

0.1

Italy

0.10

Rome (Trappola)

ITR

0.07

Above 40N

0.10

West Europe

0.1

Italy

0.10

Trentino

TRE

0.06

Above 40N

0.10

West Europe

0.1

Portugal

0.07

Penafiel

POP

0.07

Above 40N

0.10

West Europe

0.1

Spain

0.10

Barcelona

BAR

0.11

Below 40N

0.09

East Europe

0.1

Greece

0.09

Athens

GRA

0.09

Below 40N

0.09

East Europe

0.1

Greece

0.09

Chania

GRC

0.09

Below 40N

0.09

East Europe

0.1

Turkey

0.09

Aliaga

TUA

0.09

Below 40N

0.09

West Europe

0.1

Italy

0.10

Sicilia

SIC

0.09

Below 40N

0.09

West Europe

0.1

Malta

0.07

Luqa

MAL

0.07

Below 40N

0.09

West Europe

0.1

Spain

0.10

Badajoz

SPB

0.09

Below 40N

0.09

West Europe

0.1

Spain

0.10

Catarroja

SPC

0.10

Below 40N

0.09

West Europe

0.1

Spain

0.10

Magaluf

SPM

0.08

Below 40N

0.09

West Europe

0.1

Spain

0.10

San Roque

SPS

0.12

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "europe",
  "fst", 
  "r_01",
  "fst_Europe_r01_LD2_SNPS.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "output", "europe", "fst", "r_01", "LD2_country_europe_r01.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Save it

ggsave(
  filename = here("output", "europe", "fst", "r_01", "mean_fst_by_country_europe_r01_LD2.pdf"),
  width = 10,
  height = 10,
  units = "in"
)

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           FRS       STS       POP       SPB       SPS       SPC      BAR
## FRS    0.0000  412.1522 1225.4383 1263.7518 1371.4448  817.6702 511.8716
## STS  412.1522    0.0000 1509.1745 1601.1573 1750.4874 1213.5639 915.9614
## POP 1225.4383 1509.1745    0.0000  282.8409  614.5101  701.9619 878.2910
## SPB 1263.7518 1601.1573  282.8409    0.0000  331.7685  571.0509 827.0716
## SPS 1371.4448 1750.4874  614.5101  331.7685    0.0000  566.5153 874.4050
## SPC  817.6702 1213.5639  701.9619  571.0509  566.5153    0.0000 310.0074
##           SPM       IMP       BRE       DES       TRE       ITB       CES
## FRS  683.8845  228.2822  349.1505  375.3876  427.8791  448.0871  524.5893
## STS 1095.6072  527.9620  389.6504  409.5459  380.8019  536.0256  604.0707
## POP  939.4811 1370.3898 1572.0008 1596.9006 1653.2807 1644.8695 1711.3013
## SPB  822.8201 1367.0861 1594.7230 1616.9360 1682.2286 1643.6694 1701.8380
## SPS  787.3372 1423.5497 1670.1969 1688.5822 1761.4648 1690.3169 1736.8728
## SPC  251.7726  857.0395 1104.4760 1122.4598 1195.9602 1124.6677 1173.2270
##           ROM       ITR      SIC      ITP      MAL       SLO      CRO      ALV
## FRS  654.1062  654.1062 1119.260 1004.474 1269.904  628.7258 1037.845 1234.162
## STS  834.5057  834.5057 1314.450 1098.126 1522.503  546.2343 1052.803 1296.908
## POP 1734.4434 1734.4434 2064.584 2102.566 2066.730 1851.8481 2197.100 2334.720
## SPB 1683.0686 1683.0686 1957.316 2040.301 1923.429 1869.2123 2157.626 2266.196
## SPS 1669.1413 1669.1413 1865.247 2002.327 1785.308 1929.6259 2144.409 2215.487
## SPC 1124.2692 1124.2692 1387.383 1474.434 1368.139 1363.2153 1600.981 1697.872
##          ALD      TIR       SER      GRA      GRC      ROS      BUL      TUA
## FRS 1192.210 1216.107 1100.2591 1709.174 1877.379 1341.427 1392.500 1884.818
## STS 1230.239 1248.235  988.6774 1763.684 1974.712 1124.852 1304.105 1885.544
## POP 2319.742 2346.521 2315.5228 2777.366 2876.039 2566.256 2591.385 3000.557
## SPB 2262.516 2289.859 2313.3538 2686.566 2760.206 2594.045 2569.099 2924.845
## SPS 2225.649 2253.121 2340.8167 2601.355 2642.916 2651.131 2568.416 2854.982
## SPC 1697.515 1724.957 1780.6744 2115.539 2193.176 2085.530 2019.617 2354.477
##          TUH      SEV      ALU      KER      KRA      SOC      TIK      RAR
## FRS 2897.030 2179.621 2245.512 2390.859 2596.424 2688.509 2660.271 2758.005
## STS 2745.357 2012.146 2071.666 2197.341 2400.939 2511.584 2447.357 2555.484
## POP 4103.736 3397.015 3464.308 3613.507 3818.891 3906.072 3885.080 3981.365
## SPB 4076.402 3387.944 3457.259 3614.231 3818.203 3894.554 3893.085 3982.756
## SPS 4053.790 3392.537 3463.561 3628.220 3828.858 3891.912 3913.323 3994.595
## SPC 3518.971 2842.701 2913.061 3074.730 3276.925 3346.038 3357.781 3442.342
##          GES      ARM
## FRS 2793.592 3207.499
## STS 2619.862 3045.177
## POP 4009.674 4417.211
## SPB 3995.220 4391.989
## SPS 3988.376 4369.793
## SPC 3444.704 3834.995

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0


# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))

data <- data |>
  dplyr::arrange(
    Distance
  )
head(data)
##   Distance FST row_index col_index
## 1        0   0       FRS       FRS
## 2        0   0       STS       STS
## 3        0   0       POP       POP
## 4        0   0       SPB       SPB
## 5        0   0       SPS       SPS
## 6        0   0       SPC       SPC

Fit linear regression

# Fit linear model
lm_model <- lm(FST ~ Distance, data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("analyses", "my_theme2.R"))


# Plot
ggplot(data, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  annotate("text", x = max(data$Distance) * 0.85, y = max(data$FST) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - European populations",
       x = "Distance (Km)",
       y = "FST") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "europe", "fst", "r_01", "fst_by_distance_europe_r01_LD2.pdf"),
  width = 6,
  height = 4,
  units = "in"
)

3.4 Subset by country

Select countries with at least 3 sampling localities in Europe & Asia

countries_with_3_pops <- filtered_sampling_loc %>%
  group_by(Country) %>%
  filter(n() >= 3) %>%
  pull(Country) %>%
  unique()
countries_with_3_pops
## [1] "Spain"   "Italy"   "Albania" "Ukraine" "Russia"

Do test for each country

results <- list()

for (country in countries_with_3_pops) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc %>%
    filter(Country == country) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  
  # Perform linear regression
  lm_model <- lm(FST ~ Distance, data = subset_data)
  results[[country]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $Spain
## $Spain$equation
## [1] "y = 0.00003x + 0.038"
## 
## $Spain$r2
## [1] "R^2 = 0.05"
## 
## 
## $Italy
## $Italy$equation
## [1] "y = 0.00004x + 0.088"
## 
## $Italy$r2
## [1] "R^2 = 0.04"
## 
## 
## $Albania
## $Albania$equation
## [1] "y = 0.00067x + 0.015"
## 
## $Albania$r2
## [1] "R^2 = 0.61"
## 
## 
## $Ukraine
## $Ukraine$equation
## [1] "y = 0.00024x + 0.065"
## 
## $Ukraine$r2
## [1] "R^2 = 0.09"
## 
## 
## $Russia
## $Russia$equation
## [1] "y = 0.00041x + 0.011"
## 
## $Russia$r2
## [1] "R^2 = 0.40"

Calculate linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST ~ Distance, data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("y = %.3fx + %.3f", coef(model)[2], coef(model)[1]),   r2 = sprintf("R^2 = %.2f", summary(model)$r.squared))

Plot it

ggplot(data_filtered, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(data = regression_results, aes(label = paste(equation, r2, sep = "\n"), x = Inf, y = Inf), 
            vjust = 2, hjust = 2, size = 3.5, inherit.aes = FALSE) +
  labs(title = "FST vs Distance by Country",
       x = "Distance",
       y = "FST") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "europe", "fst", "r_01", "fst_by_distance_countries_Europe_r01_LD2.pdf"),
  width = 6,
  height = 8,
  units = "in"
)

We can merge the FST and distance matrices

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the matrix (Fst two decimals and distance in Km with zero decimals)
# Format the elements based on their position in the matrix
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Now the merged_matrix should be formatted as you need
print(merged_matrix)
##     ALD    ALU    ALV    ARM    BAR    BRE    BUL    CES    CRO    DES   
## ALD NA     "0.12" "0.02" "0.11" "0.12" "0.17" "0.11" "0.20" "0.05" "0.16"
## ALU "1268" NA     "0.10" "0.06" "0.11" "0.16" "0.10" "0.19" "0.09" "0.15"
## ALV "93"   "1307" NA     "0.09" "0.10" "0.16" "0.09" "0.18" "0.03" "0.14"
## ARM "2144" "974"  "2159" NA     "0.11" "0.13" "0.09" "0.16" "0.08" "0.11"
## BAR "1446" "2631" "1458" "3566" NA     "0.18" "0.11" "0.21" "0.09" "0.17"
## BRE "887"  "1896" "942"  "2859" "796"  NA     "0.15" "0.16" "0.15" "0.09"
## BUL "414"  "895"  "483"  "1826" "1742" "1048" NA     "0.19" "0.09" "0.14"
## CES "673"  "1757" "724"  "2706" "878"  "223"  "881"  NA     "0.18" "0.14"
## CRO "180"  "1321" "260"  "2235" "1333" "719"  "428"  "514"  NA     "0.13"
## DES "859"  "1870" "915"  "2832" "815"  "28"   "1021" "198"  "691"  NA    
## FRS "1192" "2246" "1234" "3207" "512"  "349"  "1393" "525"  "1038" "375" 
## GES "1770" "550"  "1795" "429"  "3168" "2444" "1426" "2300" "1845" "2418"
## GRA "534"  "1151" "476"  "1848" "1900" "1417" "656"  "1200" "711"  "1390"
## GRC "754"  "1348" "679"  "1937" "2007" "1611" "925"  "1389" "934"  "1585"
## IMP "984"  "2097" "1020" "3047" "552"  "254"  "1221" "341"  "840"  "268" 
## ITB "752"  "1821" "802"  "2773" "824"  "148"  "950"  "79"   "591"  "126" 
## ITP "224"  "1482" "235"  "2367" "1227" "728"  "604"  "506"  "201"  "702" 
## ITR "585"  "1794" "605"  "2711" "862"  "445"  "899"  "252"  "477"  "427" 
## KER "1443" "179"  "1484" "862"  "2789" "2042" "1062" "1913" "1490" "2017"
## KRA "1633" "366"  "1670" "674"  "2992" "2248" "1261" "2117" "1687" "2222"
## MAL "746"  "1948" "674"  "2719" "1233" "1135" "1157" "943"  "817"  "1119"
## POP "2320" "3464" "2335" "4417" "878"  "1572" "2591" "1711" "2197" "1597"
## RAR "1798" "532"  "1833" "560"  "3157" "2410" "1427" "2281" "1853" "2385"
## ROM "585"  "1794" "605"  "2711" "862"  "445"  "899"  "252"  "477"  "427" 
## ROS "771"  "950"  "859"  "1923" "1783" "1000" "445"  "917"  "683"  "977" 
## SER "442"  "1149" "534"  "2107" "1489" "752"  "316"  "612"  "322"  "725" 
## SEV "1196" "72"   "1235" "1034" "2561" "1830" "824"  "1689" "1250" "1804"
## SIC "480"  "1724" "420"  "2545" "1195" "925"  "894"  "715"  "537"  "905" 
## SLO "689"  "1617" "761"  "2581" "1059" "280"  "782"  "227"  "509"  "255" 
## SOC "1678" "443"  "1706" "538"  "3068" "2339" "1327" "2197" "1748" "2313"
## SPB "2263" "3457" "2266" "4392" "827"  "1595" "2569" "1702" "2158" "1617"
## SPC "1698" "2913" "1698" "3835" "310"  "1104" "2020" "1173" "1601" "1122"
## SPM "1450" "2677" "1448" "3591" "211"  "919"  "1782" "956"  "1360" "933" 
## SPS "2226" "3464" "2215" "4370" "874"  "1670" "2568" "1737" "2144" "1689"
## STS "1230" "2072" "1297" "3045" "916"  "390"  "1304" "604"  "1053" "410" 
## TIK "1733" "467"  "1774" "687"  "3069" "2313" "1350" "2191" "1778" "2288"
## TIR "27"   "1242" "99"   "2117" "1473" "908"  "393"  "696"  "196"  "881" 
## TRE "858"  "1821" "920"  "2787" "887"  "92"   "986"  "230"  "684"  "80"  
## TUA "694"  "904"  "667"  "1571" "2123" "1569" "641"  "1363" "851"  "1542"
## TUH "1828" "679"  "1844" "316"  "3251" "2549" "1512" "2393" "1919" "2522"
##     FRS    GES    GRA    GRC    IMP    ITB    ITP    ITR    KER    KRA   
## ALD "0.09" "0.15" "0.08" "0.08" "0.11" "0.13" "0.08" "0.08" "0.12" "0.11"
## ALU "0.08" "0.09" "0.08" "0.09" "0.09" "0.11" "0.08" "0.07" "0.06" "0.05"
## ALV "0.07" "0.13" "0.06" "0.05" "0.08" "0.10" "0.06" "0.06" "0.10" "0.10"
## ARM "0.06" "0.06" "0.08" "0.08" "0.08" "0.09" "0.07" "0.07" "0.04" "0.02"
## BAR "0.09" "0.14" "0.09" "0.10" "0.11" "0.13" "0.08" "0.07" "0.12" "0.11"
## BRE "0.12" "0.18" "0.15" "0.15" "0.13" "0.12" "0.12" "0.14" "0.15" "0.15"
## BUL "0.07" "0.13" "0.09" "0.09" "0.09" "0.10" "0.06" "0.05" "0.11" "0.10"
## CES "0.15" "0.21" "0.18" "0.19" "0.18" "0.17" "0.15" "0.17" "0.18" "0.18"
## CRO "0.06" "0.12" "0.06" "0.05" "0.08" "0.09" "0.05" "0.05" "0.10" "0.09"
## DES "0.11" "0.17" "0.14" "0.14" "0.13" "0.11" "0.11" "0.12" "0.14" "0.14"
## FRS NA     "0.11" "0.07" "0.07" "0.05" "0.06" "0.04" "0.04" "0.08" "0.08"
## GES "2794" NA     "0.11" "0.12" "0.13" "0.14" "0.11" "0.10" "0.05" "0.02"
## GRA "1709" "1540" NA     "0.02" "0.08" "0.10" "0.05" "0.05" "0.09" "0.08"
## GRC "1877" "1675" "270"  NA     "0.09" "0.10" "0.06" "0.06" "0.10" "0.09"
## IMP "228"  "2640" "1492" "1653" NA     "0.08" "0.05" "0.05" "0.09" "0.09"
## ITB "448"  "2365" "1279" "1467" "277"  NA     "0.07" "0.07" "0.11" "0.11"
## ITP "1004" "1991" "705"  "884"  "787"  "583"  NA     "0.02" "0.08" "0.08"
## ITR "654"  "2321" "1070" "1224" "428"  "302"  "371"  NA     "0.08" "0.07"
## KER "2391" "434"  "1327" "1517" "2250" "1973" "1655" "1960" NA     "0.03"
## KRA "2596" "257"  "1478" "1648" "2455" "2178" "1848" "2160" "206"  NA    
## MAL "1270" "2395" "873"  "862"  "1049" "996"  "621"  "694"  "2127" "2301"
## POP "1225" "4010" "2777" "2876" "1370" "1645" "2103" "1734" "3614" "3819"
## RAR "2758" "210"  "1629" "1789" "2619" "2342" "2014" "2326" "369"  "166" 
## ROM "654"  "2321" "1070" "1224" "428"  "302"  "371"  "0"    "1960" "2160"
## ROS "1341" "1496" "1100" "1370" "1232" "961"  "884"  "1047" "1073" "1276"
## SER "1100" "1695" "884"  "1141" "949"  "672"  "520"  "697"  "1301" "1506"
## SEV "2180" "614"  "1086" "1288" "2029" "1753" "1410" "1723" "250"  "438" 
## SIC "1119" "2197" "736"  "812"  "891"  "779"  "340"  "483"  "1901" "2084"
## SLO "629"  "2166" "1220" "1440" "507"  "245"  "585"  "455"  "1762" "1968"
## SOC "2689" "109"  "1470" "1618" "2537" "2262" "1898" "2224" "325"  "161" 
## SPB "1264" "3995" "2687" "2760" "1367" "1644" "2040" "1683" "3614" "3818"
## SPC "818"  "3445" "2116" "2193" "857"  "1125" "1474" "1124" "3075" "3277"
## SPM "684"  "3204" "1864" "1944" "665"  "916"  "1227" "883"  "2841" "3042"
## SPS "1371" "3988" "2601" "2643" "1424" "1690" "2002" "1669" "3628" "3829"
## STS "412"  "2620" "1764" "1975" "528"  "536"  "1098" "835"  "2197" "2401"
## TIK "2660" "315"  "1600" "1776" "2527" "2250" "1945" "2247" "290"  "132" 
## TIR "1216" "1743" "515"  "742"  "1009" "775"  "251"  "612"  "1417" "1607"
## TRE "428"  "2370" "1391" "1596" "346"  "176"  "717"  "476"  "1964" "2170"
## TUA "1885" "1265" "277"  "445"  "1678" "1441" "901"  "1271" "1075" "1214"
## TUH "2897" "194"  "1544" "1650" "2734" "2461" "2051" "2395" "597"  "443" 
##     MAL    POP    RAR    ROM    ROS    SER    SEV    SIC    SLO    SOC   
## ALD "0.08" "0.09" "0.15" "0.10" "0.12" "0.22" "0.16" "0.11" "0.08" "0.14"
## ALU "0.07" "0.08" "0.08" "0.08" "0.11" "0.20" "0.10" "0.10" "0.07" "0.07"
## ALV "0.06" "0.07" "0.13" "0.07" "0.10" "0.18" "0.14" "0.09" "0.06" "0.12"
## ARM "0.06" "0.05" "0.05" "0.08" "0.09" "0.19" "0.07" "0.08" "0.05" "0.04"
## BAR "0.08" "0.09" "0.14" "0.08" "0.13" "0.22" "0.16" "0.10" "0.08" "0.14"
## BRE "0.12" "0.11" "0.19" "0.15" "0.14" "0.25" "0.20" "0.12" "0.10" "0.18"
## BUL "0.06" "0.07" "0.14" "0.07" "0.10" "0.19" "0.15" "0.08" "0.07" "0.13"
## CES "0.15" "0.13" "0.22" "0.19" "0.17" "0.28" "0.23" "0.16" "0.14" "0.21"
## CRO "0.06" "0.06" "0.12" "0.05" "0.10" "0.17" "0.14" "0.08" "0.06" "0.12"
## DES "0.11" "0.08" "0.18" "0.14" "0.12" "0.23" "0.19" "0.09" "0.08" "0.17"
## FRS "0.04" "0.04" "0.11" "0.04" "0.07" "0.16" "0.12" "0.06" "0.04" "0.10"
## GES "0.10" "0.11" "0.03" "0.11" "0.14" "0.23" "0.05" "0.12" "0.10" "0.01"
## GRA "0.05" "0.07" "0.12" "0.05" "0.10" "0.19" "0.13" "0.08" "0.05" "0.11"
## GRC "0.06" "0.07" "0.12" "0.06" "0.10" "0.19" "0.13" "0.09" "0.06" "0.12"
## IMP "0.04" "0.05" "0.13" "0.06" "0.09" "0.22" "0.15" "0.07" "0.05" "0.12"
## ITB "0.06" "0.05" "0.15" "0.08" "0.10" "0.22" "0.16" "0.08" "0.06" "0.13"
## ITP "0.02" "0.04" "0.11" "0.02" "0.08" "0.17" "0.12" "0.04" "0.04" "0.10"
## ITR "0.02" "0.03" "0.10" "0.01" "0.08" "0.16" "0.12" "0.05" "0.04" "0.10"
## KER "0.08" "0.07" "0.05" "0.09" "0.11" "0.20" "0.07" "0.10" "0.07" "0.05"
## KRA "0.07" "0.07" "0.01" "0.08" "0.11" "0.20" "0.03" "0.09" "0.07" "0.01"
## MAL NA     "0.03" "0.11" "0.02" "0.07" "0.16" "0.12" "0.05" "0.04" "0.10"
## POP "2067" NA     "0.11" "0.05" "0.07" "0.17" "0.12" "0.05" "0.04" "0.10"
## RAR "2460" "3981" NA     "0.12" "0.14" "0.23" "0.05" "0.13" "0.10" "0.02"
## ROM "694"  "1734" "2326" NA     "0.09" "0.21" "0.13" "0.06" "0.04" "0.11"
## ROS "1498" "2566" "1432" "1047" NA     "0.20" "0.16" "0.09" "0.06" "0.13"
## SER "1139" "2316" "1670" "697"  "367"  NA     "0.25" "0.19" "0.15" "0.23"
## SEV "1878" "3397" "604"  "1723" "894"  "1082" NA     "0.14" "0.11" "0.05"
## SIC "280"  "2065" "2246" "483"  "1219" "858"  "1652" NA     "0.05" "0.12"
## SLO "1118" "1852" "2130" "455"  "726"  "476"  "1551" "865"  NA     "0.09"
## SOC "2316" "3906" "187"  "2224" "1388" "1591" "509"  "2113" "2060" NA    
## SPB "1923" "283"  "3983" "1683" "2594" "2313" "3388" "1957" "1869" "3895"
## SPC "1368" "702"  "3442" "1124" "2086" "1781" "2843" "1387" "1363" "3346"
## SPM "1128" "939"  "3208" "883"  "1873" "1555" "2606" "1136" "1160" "3107"
## SPS "1785" "615"  "3995" "1669" "2651" "2341" "3393" "1865" "1930" "3892"
## STS "1523" "1509" "2555" "835"  "1125" "989"  "2012" "1314" "546"  "2512"
## TIK "2414" "3885" "127"  "2247" "1328" "1580" "539"  "2190" "2033" "253" 
## TIR "764"  "2347" "1772" "612"  "761"  "439"  "1170" "502"  "705"  "1651"
## TRE "1170" "1653" "2331" "476"  "915"  "683"  "1756" "945"  "206"  "2264"
## TUA "1148" "3001" "1361" "1271" "1057" "932"  "843"  "994"  "1340" "1198"
## TUH "2412" "4104" "398"  "2395" "1629" "1797" "734"  "2233" "2272" "283" 
##     SPB    SPC    SPM    SPS    STS    TIK    TIR     TRE    TUA    TUH   
## ALD "0.12" "0.13" "0.10" "0.14" "0.10" "0.16" "0.03"  "0.08" "0.10" "0.09"
## ALU "0.10" "0.11" "0.09" "0.13" "0.09" "0.09" "0.10"  "0.07" "0.09" "0.08"
## ALV "0.10" "0.11" "0.08" "0.11" "0.07" "0.14" "-0.00" "0.06" "0.08" "0.07"
## ARM "0.06" "0.07" "0.06" "0.12" "0.07" "0.07" "0.09"  "0.05" "0.09" "0.07"
## BAR "0.12" "0.13" "0.10" "0.11" "0.09" "0.16" "0.10"  "0.08" "0.06" "0.04"
## BRE "0.10" "0.12" "0.11" "0.18" "0.12" "0.19" "0.17"  "0.09" "0.16" "0.13"
## BUL "0.10" "0.10" "0.08" "0.12" "0.08" "0.15" "0.10"  "0.06" "0.08" "0.07"
## CES "0.13" "0.15" "0.14" "0.22" "0.15" "0.22" "0.21"  "0.11" "0.19" "0.16"
## CRO "0.09" "0.10" "0.08" "0.11" "0.06" "0.13" "0.02"  "0.05" "0.08" "0.07"
## DES "0.08" "0.10" "0.09" "0.17" "0.11" "0.18" "0.15"  "0.07" "0.14" "0.12"
## FRS "0.06" "0.07" "0.05" "0.10" "0.03" "0.12" "0.07"  "0.03" "0.07" "0.05"
## GES "0.12" "0.13" "0.12" "0.16" "0.11" "0.05" "0.14"  "0.10" "0.12" "0.11"
## GRA "0.09" "0.10" "0.08" "0.10" "0.07" "0.13" "0.06"  "0.06" "0.07" "0.06"
## GRC "0.09" "0.10" "0.08" "0.11" "0.07" "0.14" "0.06"  "0.06" "0.08" "0.07"
## IMP "0.07" "0.08" "0.07" "0.10" "0.05" "0.15" "0.10"  "0.04" "0.08" "0.06"
## ITB "0.08" "0.09" "0.07" "0.13" "0.07" "0.16" "0.11"  "0.05" "0.11" "0.08"
## ITP "0.07" "0.08" "0.05" "0.09" "0.04" "0.12" "0.06"  "0.03" "0.06" "0.04"
## ITR "0.08" "0.09" "0.06" "0.07" "0.04" "0.12" "0.06"  "0.04" "0.04" "0.03"
## KER "0.09" "0.10" "0.09" "0.13" "0.08" "0.07" "0.11"  "0.07" "0.10" "0.08"
## KRA "0.09" "0.10" "0.08" "0.13" "0.08" "0.04" "0.10"  "0.07" "0.09" "0.08"
## MAL "0.06" "0.07" "0.05" "0.07" "0.04" "0.12" "0.07"  "0.03" "0.05" "0.04"
## POP "0.05" "0.05" "0.04" "0.09" "0.04" "0.12" "0.07"  "0.03" "0.06" "0.05"
## RAR "0.13" "0.14" "0.12" "0.16" "0.12" "0.05" "0.14"  "0.11" "0.13" "0.11"
## ROM "0.08" "0.09" "0.07" "0.09" "0.04" "0.13" "0.07"  "0.03" "0.06" "0.04"
## ROS "0.09" "0.10" "0.09" "0.13" "0.08" "0.15" "0.12"  "0.05" "0.10" "0.08"
## SER "0.20" "0.22" "0.21" "0.24" "0.17" "0.25" "0.22"  "0.15" "0.20" "0.17"
## SEV "0.14" "0.15" "0.14" "0.17" "0.13" "0.08" "0.15"  "0.12" "0.14" "0.12"
## SIC "0.08" "0.09" "0.06" "0.11" "0.06" "0.14" "0.10"  "0.04" "0.08" "0.06"
## SLO "0.05" "0.06" "0.04" "0.09" "0.04" "0.11" "0.06"  "0.02" "0.06" "0.04"
## SOC "0.12" "0.12" "0.11" "0.15" "0.11" "0.06" "0.13"  "0.10" "0.12" "0.10"
## SPB NA     "0.01" "0.01" "0.13" "0.06" "0.14" "0.10"  "0.04" "0.10" "0.08"
## SPC "571"  NA     "0.02" "0.13" "0.07" "0.15" "0.11"  "0.05" "0.10" "0.08"
## SPM "823"  "252"  NA     "0.06" "0.05" "0.13" "0.10"  "0.03" "0.08" "0.06"
## SPS "332"  "567"  "787"  NA     "0.10" "0.17" "0.13"  "0.08" "0.03" "0.09"
## STS "1601" "1214" "1096" "1750" NA     "0.13" "0.08"  "0.03" "0.07" "0.05"
## TIK "3893" "3358" "3126" "3913" "2447" NA     "0.15"  "0.11" "0.14" "0.12"
## TIR "2290" "1725" "1478" "2253" "1248" "1707" NA      "0.06" "0.08" "0.07"
## TRE "1682" "1196" "1011" "1761" "381"  "2233" "877"   NA     "0.06" "0.05"
## TUA "2925" "2354" "2103" "2855" "1886" "1339" "669"   "1529" NA     "0.05"
## TUH "4076" "3519" "3275" "4054" "2745" "508"  "1801"  "2478" "1267" NA
cities <- readRDS(here("output", "sampling_loc_all.rds"))
cities <- as_tibble(cities)
head(cities)
## # A tibble: 6 × 10
##   Pop_City     Country    Latitude Longitude Continent Abbreviation Year  Region
##   <chr>        <chr>         <dbl>     <dbl> <chr>     <chr>        <chr> <chr> 
## 1 Franceville  Gabon         -1.59      13.5 Africa    GAB          2015  Centr…
## 2 Antananarivo Madagascar   -18.9       47.5 Africa    ANT          2022  East …
## 3 Diego ville  Madagascar   -12.3       49.3 Africa    DGV          2022  East …
## 4 Morondava    Madagascar   -20.3       44.3 Africa    MAD          2016  East …
## 5 Vohimasy     Madagascar   -22.8       47.8 Africa    VOH          2016… East …
## 6 Dauguet      Mauritius    -20.2       57.5 Africa    DAU          2022  India…
## # ℹ 2 more variables: Subregion <chr>, order <int>

We can sort by distance

# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Your existing code to initialize and fill the merged_matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Formatting code with absolute value for upper triangular part
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     CRO    ALD    ITP    TIR    SER    ALV    CES    ITR    ROM    SLO   
## CRO NA     "0.05" "0.05" "0.02" "0.17" "0.03" "0.18" "0.05" "0.05" "0.06"
## ALD "180"  NA     "0.08" "0.03" "0.22" "0.02" "0.20" "0.08" "0.10" "0.08"
## ITP "201"  "224"  NA     "0.06" "0.17" "0.06" "0.15" "0.02" "0.02" "0.04"
## TIR "196"  "27"   "251"  NA     "0.22" "0.00" "0.21" "0.06" "0.07" "0.06"
## SER "322"  "442"  "520"  "439"  NA     "0.18" "0.28" "0.16" "0.21" "0.15"
## ALV "260"  "93"   "235"  "99"   "534"  NA     "0.18" "0.06" "0.07" "0.06"
## CES "514"  "673"  "506"  "696"  "612"  "724"  NA     "0.17" "0.19" "0.14"
## ITR "477"  "585"  "371"  "612"  "697"  "605"  "252"  NA     "0.01" "0.04"
## ROM "477"  "585"  "371"  "612"  "697"  "605"  "252"  "0"    NA     "0.04"
## SLO "509"  "689"  "585"  "705"  "476"  "761"  "227"  "455"  "455"  NA    
## ITB "591"  "752"  "583"  "775"  "672"  "802"  "79"   "302"  "302"  "245" 
## BUL "428"  "414"  "604"  "393"  "316"  "483"  "881"  "899"  "899"  "782" 
## DES "691"  "859"  "702"  "881"  "725"  "915"  "198"  "427"  "427"  "255" 
## TRE "684"  "858"  "717"  "877"  "683"  "920"  "230"  "476"  "476"  "206" 
## BRE "719"  "887"  "728"  "908"  "752"  "942"  "223"  "445"  "445"  "280" 
## SIC "537"  "480"  "340"  "502"  "858"  "420"  "715"  "483"  "483"  "865" 
## ROS "683"  "771"  "884"  "761"  "367"  "859"  "917"  "1047" "1047" "726" 
## IMP "840"  "984"  "787"  "1009" "949"  "1020" "341"  "428"  "428"  "507" 
## GRA "711"  "534"  "705"  "515"  "884"  "476"  "1200" "1070" "1070" "1220"
## TUA "851"  "694"  "901"  "669"  "932"  "667"  "1363" "1271" "1271" "1340"
## FRS "1038" "1192" "1004" "1216" "1100" "1234" "525"  "654"  "654"  "629" 
## MAL "817"  "746"  "621"  "764"  "1139" "674"  "943"  "694"  "694"  "1118"
## STS "1053" "1230" "1098" "1248" "989"  "1297" "604"  "835"  "835"  "546" 
## GRC "934"  "754"  "884"  "742"  "1141" "679"  "1389" "1224" "1224" "1440"
## SEV "1250" "1196" "1410" "1170" "1082" "1235" "1689" "1723" "1723" "1551"
## ALU "1321" "1268" "1482" "1242" "1149" "1307" "1757" "1794" "1794" "1617"
## BAR "1333" "1446" "1227" "1473" "1489" "1458" "878"  "862"  "862"  "1059"
## SPM "1360" "1450" "1227" "1478" "1555" "1448" "956"  "883"  "883"  "1160"
## KER "1490" "1443" "1655" "1417" "1301" "1484" "1913" "1960" "1960" "1762"
## KRA "1687" "1633" "1848" "1607" "1506" "1670" "2117" "2160" "2160" "1968"
## SPC "1601" "1698" "1474" "1725" "1781" "1698" "1173" "1124" "1124" "1363"
## SOC "1748" "1678" "1898" "1651" "1591" "1706" "2197" "2224" "2224" "2060"
## TIK "1778" "1733" "1945" "1707" "1580" "1774" "2191" "2247" "2247" "2033"
## GES "1845" "1770" "1991" "1743" "1695" "1795" "2300" "2321" "2321" "2166"
## RAR "1853" "1798" "2014" "1772" "1670" "1833" "2281" "2326" "2326" "2130"
## TUH "1919" "1828" "2051" "1801" "1797" "1844" "2393" "2395" "2395" "2272"
## ARM "2235" "2144" "2367" "2117" "2107" "2159" "2706" "2711" "2711" "2581"
## SPB "2158" "2263" "2040" "2290" "2313" "2266" "1702" "1683" "1683" "1869"
## SPS "2144" "2226" "2002" "2253" "2341" "2215" "1737" "1669" "1669" "1930"
## POP "2197" "2320" "2103" "2347" "2316" "2335" "1711" "1734" "1734" "1852"
##     ITB    BUL    DES    TRE    BRE    SIC    ROS    IMP    GRA    TUA   
## CRO "0.09" "0.09" "0.13" "0.05" "0.15" "0.08" "0.10" "0.08" "0.06" "0.08"
## ALD "0.13" "0.11" "0.16" "0.08" "0.17" "0.11" "0.12" "0.11" "0.08" "0.10"
## ITP "0.07" "0.06" "0.11" "0.03" "0.12" "0.04" "0.08" "0.05" "0.05" "0.06"
## TIR "0.11" "0.10" "0.15" "0.06" "0.17" "0.10" "0.12" "0.10" "0.06" "0.08"
## SER "0.22" "0.19" "0.23" "0.15" "0.25" "0.19" "0.20" "0.22" "0.19" "0.20"
## ALV "0.10" "0.09" "0.14" "0.06" "0.16" "0.09" "0.10" "0.08" "0.06" "0.08"
## CES "0.17" "0.19" "0.14" "0.11" "0.16" "0.16" "0.17" "0.18" "0.18" "0.19"
## ITR "0.07" "0.05" "0.12" "0.04" "0.14" "0.05" "0.08" "0.05" "0.05" "0.04"
## ROM "0.08" "0.07" "0.14" "0.03" "0.15" "0.06" "0.09" "0.06" "0.05" "0.06"
## SLO "0.06" "0.07" "0.08" "0.02" "0.10" "0.05" "0.06" "0.05" "0.05" "0.06"
## ITB NA     "0.10" "0.11" "0.05" "0.12" "0.08" "0.10" "0.08" "0.10" "0.11"
## BUL "950"  NA     "0.14" "0.06" "0.15" "0.08" "0.10" "0.09" "0.09" "0.08"
## DES "126"  "1021" NA     "0.07" "0.09" "0.09" "0.12" "0.13" "0.14" "0.14"
## TRE "176"  "986"  "80"   NA     "0.09" "0.04" "0.05" "0.04" "0.06" "0.06"
## BRE "148"  "1048" "28"   "92"   NA     "0.12" "0.14" "0.13" "0.15" "0.16"
## SIC "779"  "894"  "905"  "945"  "925"  NA     "0.09" "0.07" "0.08" "0.08"
## ROS "961"  "445"  "977"  "915"  "1000" "1219" NA     "0.09" "0.10" "0.10"
## IMP "277"  "1221" "268"  "346"  "254"  "891"  "1232" NA     "0.08" "0.08"
## GRA "1279" "656"  "1390" "1391" "1417" "736"  "1100" "1492" NA     "0.07"
## TUA "1441" "641"  "1542" "1529" "1569" "994"  "1057" "1678" "277"  NA    
## FRS "448"  "1393" "375"  "428"  "349"  "1119" "1341" "228"  "1709" "1885"
## MAL "996"  "1157" "1119" "1170" "1135" "280"  "1498" "1049" "873"  "1148"
## STS "536"  "1304" "410"  "381"  "390"  "1314" "1125" "528"  "1764" "1886"
## GRC "1467" "925"  "1585" "1596" "1611" "812"  "1370" "1653" "270"  "445" 
## SEV "1753" "824"  "1804" "1756" "1830" "1652" "894"  "2029" "1086" "843" 
## ALU "1821" "895"  "1870" "1821" "1896" "1724" "950"  "2097" "1151" "904" 
## BAR "824"  "1742" "815"  "887"  "796"  "1195" "1783" "552"  "1900" "2123"
## SPM "916"  "1782" "933"  "1011" "919"  "1136" "1873" "665"  "1864" "2103"
## KER "1973" "1062" "2017" "1964" "2042" "1901" "1073" "2250" "1327" "1075"
## KRA "2178" "1261" "2222" "2170" "2248" "2084" "1276" "2455" "1478" "1214"
## SPC "1125" "2020" "1122" "1196" "1104" "1387" "2086" "857"  "2116" "2354"
## SOC "2262" "1327" "2313" "2264" "2339" "2113" "1388" "2537" "1470" "1198"
## TIK "2250" "1350" "2288" "2233" "2313" "2190" "1328" "2527" "1600" "1339"
## GES "2365" "1426" "2418" "2370" "2444" "2197" "1496" "2640" "1540" "1265"
## RAR "2342" "1427" "2385" "2331" "2410" "2246" "1432" "2619" "1629" "1361"
## TUH "2461" "1512" "2522" "2478" "2549" "2233" "1629" "2734" "1544" "1267"
## ARM "2773" "1826" "2832" "2787" "2859" "2545" "1923" "3047" "1848" "1571"
## SPB "1644" "2569" "1617" "1682" "1595" "1957" "2594" "1367" "2687" "2925"
## SPS "1690" "2568" "1689" "1761" "1670" "1865" "2651" "1424" "2601" "2855"
## POP "1645" "2591" "1597" "1653" "1572" "2065" "2566" "1370" "2777" "3001"
##     FRS    MAL    STS    GRC    SEV    ALU    BAR    SPM    KER    KRA   
## CRO "0.06" "0.06" "0.06" "0.05" "0.14" "0.09" "0.09" "0.08" "0.10" "0.09"
## ALD "0.09" "0.08" "0.10" "0.08" "0.16" "0.12" "0.12" "0.10" "0.12" "0.11"
## ITP "0.04" "0.02" "0.04" "0.06" "0.12" "0.08" "0.08" "0.05" "0.08" "0.08"
## TIR "0.07" "0.07" "0.08" "0.06" "0.15" "0.10" "0.10" "0.10" "0.11" "0.10"
## SER "0.16" "0.16" "0.17" "0.19" "0.25" "0.20" "0.22" "0.21" "0.20" "0.20"
## ALV "0.07" "0.06" "0.07" "0.05" "0.14" "0.10" "0.10" "0.08" "0.10" "0.10"
## CES "0.15" "0.15" "0.15" "0.19" "0.23" "0.19" "0.21" "0.14" "0.18" "0.18"
## ITR "0.04" "0.02" "0.04" "0.06" "0.12" "0.07" "0.07" "0.06" "0.08" "0.07"
## ROM "0.04" "0.02" "0.04" "0.06" "0.13" "0.08" "0.08" "0.07" "0.09" "0.08"
## SLO "0.04" "0.04" "0.04" "0.06" "0.11" "0.07" "0.08" "0.04" "0.07" "0.07"
## ITB "0.06" "0.06" "0.07" "0.10" "0.16" "0.11" "0.13" "0.07" "0.11" "0.11"
## BUL "0.07" "0.06" "0.08" "0.09" "0.15" "0.10" "0.11" "0.08" "0.11" "0.10"
## DES "0.11" "0.11" "0.11" "0.14" "0.19" "0.15" "0.17" "0.09" "0.14" "0.14"
## TRE "0.03" "0.03" "0.03" "0.06" "0.12" "0.07" "0.08" "0.03" "0.07" "0.07"
## BRE "0.12" "0.12" "0.12" "0.15" "0.20" "0.16" "0.18" "0.11" "0.15" "0.15"
## SIC "0.06" "0.05" "0.06" "0.09" "0.14" "0.10" "0.10" "0.06" "0.10" "0.09"
## ROS "0.07" "0.07" "0.08" "0.10" "0.16" "0.11" "0.13" "0.09" "0.11" "0.11"
## IMP "0.05" "0.04" "0.05" "0.09" "0.15" "0.09" "0.11" "0.07" "0.09" "0.09"
## GRA "0.07" "0.05" "0.07" "0.02" "0.13" "0.08" "0.09" "0.08" "0.09" "0.08"
## TUA "0.07" "0.05" "0.07" "0.08" "0.14" "0.09" "0.06" "0.08" "0.10" "0.09"
## FRS NA     "0.04" "0.03" "0.07" "0.12" "0.08" "0.09" "0.05" "0.08" "0.08"
## MAL "1270" NA     "0.04" "0.06" "0.12" "0.07" "0.08" "0.05" "0.08" "0.07"
## STS "412"  "1523" NA     "0.07" "0.13" "0.09" "0.09" "0.05" "0.08" "0.08"
## GRC "1877" "862"  "1975" NA     "0.13" "0.09" "0.10" "0.08" "0.10" "0.09"
## SEV "2180" "1878" "2012" "1288" NA     "0.10" "0.16" "0.14" "0.07" "0.03"
## ALU "2246" "1948" "2072" "1348" "72"   NA     "0.11" "0.09" "0.06" "0.05"
## BAR "512"  "1233" "916"  "2007" "2561" "2631" NA     "0.10" "0.12" "0.11"
## SPM "684"  "1128" "1096" "1944" "2606" "2677" "211"  NA     "0.09" "0.08"
## KER "2391" "2127" "2197" "1517" "250"  "179"  "2789" "2841" NA     "0.03"
## KRA "2596" "2301" "2401" "1648" "438"  "366"  "2992" "3042" "206"  NA    
## SPC "818"  "1368" "1214" "2193" "2843" "2913" "310"  "252"  "3075" "3277"
## SOC "2689" "2316" "2512" "1618" "509"  "443"  "3068" "3107" "325"  "161" 
## TIK "2660" "2414" "2447" "1776" "539"  "467"  "3069" "3126" "290"  "132" 
## GES "2794" "2395" "2620" "1675" "614"  "550"  "3168" "3204" "434"  "257" 
## RAR "2758" "2460" "2555" "1789" "604"  "532"  "3157" "3208" "369"  "166" 
## TUH "2897" "2412" "2745" "1650" "734"  "679"  "3251" "3275" "597"  "443" 
## ARM "3207" "2719" "3045" "1937" "1034" "974"  "3566" "3591" "862"  "674" 
## SPB "1264" "1923" "1601" "2760" "3388" "3457" "827"  "823"  "3614" "3818"
## SPS "1371" "1785" "1750" "2643" "3393" "3464" "874"  "787"  "3628" "3829"
## POP "1225" "2067" "1509" "2876" "3397" "3464" "878"  "939"  "3614" "3819"
##     SPC    SOC    TIK    GES    RAR    TUH    ARM    SPB    SPS    POP   
## CRO "0.10" "0.12" "0.13" "0.12" "0.12" "0.07" "0.08" "0.09" "0.11" "0.06"
## ALD "0.13" "0.14" "0.16" "0.15" "0.15" "0.09" "0.11" "0.12" "0.14" "0.09"
## ITP "0.08" "0.10" "0.12" "0.11" "0.11" "0.04" "0.07" "0.07" "0.09" "0.04"
## TIR "0.11" "0.13" "0.15" "0.14" "0.14" "0.07" "0.09" "0.10" "0.13" "0.07"
## SER "0.22" "0.23" "0.25" "0.23" "0.23" "0.17" "0.19" "0.20" "0.24" "0.17"
## ALV "0.11" "0.12" "0.14" "0.13" "0.13" "0.07" "0.09" "0.10" "0.11" "0.07"
## CES "0.15" "0.21" "0.22" "0.21" "0.22" "0.16" "0.16" "0.13" "0.22" "0.13"
## ITR "0.09" "0.10" "0.12" "0.10" "0.10" "0.03" "0.07" "0.08" "0.07" "0.03"
## ROM "0.09" "0.11" "0.13" "0.11" "0.12" "0.04" "0.08" "0.08" "0.09" "0.05"
## SLO "0.06" "0.09" "0.11" "0.10" "0.10" "0.04" "0.05" "0.05" "0.09" "0.04"
## ITB "0.09" "0.13" "0.16" "0.14" "0.15" "0.08" "0.09" "0.08" "0.13" "0.05"
## BUL "0.10" "0.13" "0.15" "0.13" "0.14" "0.07" "0.09" "0.10" "0.12" "0.07"
## DES "0.10" "0.17" "0.18" "0.17" "0.18" "0.12" "0.11" "0.08" "0.17" "0.08"
## TRE "0.05" "0.10" "0.11" "0.10" "0.11" "0.05" "0.05" "0.04" "0.08" "0.03"
## BRE "0.12" "0.18" "0.19" "0.18" "0.19" "0.13" "0.13" "0.10" "0.18" "0.11"
## SIC "0.09" "0.12" "0.14" "0.12" "0.13" "0.06" "0.08" "0.08" "0.11" "0.05"
## ROS "0.10" "0.13" "0.15" "0.14" "0.14" "0.08" "0.09" "0.09" "0.13" "0.07"
## IMP "0.08" "0.12" "0.15" "0.13" "0.13" "0.06" "0.08" "0.07" "0.10" "0.05"
## GRA "0.10" "0.11" "0.13" "0.11" "0.12" "0.06" "0.08" "0.09" "0.10" "0.07"
## TUA "0.10" "0.12" "0.14" "0.12" "0.13" "0.05" "0.09" "0.10" "0.03" "0.06"
## FRS "0.07" "0.10" "0.12" "0.11" "0.11" "0.05" "0.06" "0.06" "0.10" "0.04"
## MAL "0.07" "0.10" "0.12" "0.10" "0.11" "0.04" "0.06" "0.06" "0.07" "0.03"
## STS "0.07" "0.11" "0.13" "0.11" "0.12" "0.05" "0.07" "0.06" "0.10" "0.04"
## GRC "0.10" "0.12" "0.14" "0.12" "0.12" "0.07" "0.08" "0.09" "0.11" "0.07"
## SEV "0.15" "0.05" "0.08" "0.05" "0.05" "0.12" "0.07" "0.14" "0.17" "0.12"
## ALU "0.11" "0.07" "0.09" "0.09" "0.08" "0.08" "0.06" "0.10" "0.13" "0.08"
## BAR "0.13" "0.14" "0.16" "0.14" "0.14" "0.04" "0.11" "0.12" "0.11" "0.09"
## SPM "0.02" "0.11" "0.13" "0.12" "0.12" "0.06" "0.06" "0.01" "0.06" "0.04"
## KER "0.10" "0.05" "0.07" "0.05" "0.05" "0.08" "0.04" "0.09" "0.13" "0.07"
## KRA "0.10" "0.01" "0.04" "0.02" "0.01" "0.08" "0.02" "0.09" "0.13" "0.07"
## SPC NA     "0.12" "0.15" "0.13" "0.14" "0.08" "0.07" "0.01" "0.13" "0.05"
## SOC "3346" NA     "0.06" "0.01" "0.02" "0.10" "0.04" "0.12" "0.15" "0.10"
## TIK "3358" "253"  NA     "0.05" "0.05" "0.12" "0.07" "0.14" "0.17" "0.12"
## GES "3445" "109"  "315"  NA     "0.03" "0.11" "0.06" "0.12" "0.16" "0.11"
## RAR "3442" "187"  "127"  "210"  NA     "0.11" "0.05" "0.13" "0.16" "0.11"
## TUH "3519" "283"  "508"  "194"  "398"  NA     "0.07" "0.08" "0.09" "0.05"
## ARM "3835" "538"  "687"  "429"  "560"  "316"  NA     "0.06" "0.12" "0.05"
## SPB "571"  "3895" "3893" "3995" "3983" "4076" "4392" NA     "0.13" "0.05"
## SPS "567"  "3892" "3913" "3988" "3995" "4054" "4370" "332"  NA     "0.09"
## POP "702"  "3906" "3885" "4010" "3981" "4104" "4417" "283"  "615"  NA

Make a table and save it as a word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]


# Create a flextable object from the merged_matrix
ft <- qflextable(as.data.frame(merged_df)) 

ft

Population

CRO

ALD

ITP

TIR

SER

ALV

CES

ITR

ROM

SLO

ITB

BUL

DES

TRE

BRE

SIC

ROS

IMP

GRA

TUA

FRS

MAL

STS

GRC

SEV

ALU

BAR

SPM

KER

KRA

SPC

SOC

TIK

GES

RAR

TUH

ARM

SPB

SPS

POP

CRO

0.05

0.05

0.02

0.17

0.03

0.18

0.05

0.05

0.06

0.09

0.09

0.13

0.05

0.15

0.08

0.10

0.08

0.06

0.08

0.06

0.06

0.06

0.05

0.14

0.09

0.09

0.08

0.10

0.09

0.10

0.12

0.13

0.12

0.12

0.07

0.08

0.09

0.11

0.06

ALD

180

0.08

0.03

0.22

0.02

0.20

0.08

0.10

0.08

0.13

0.11

0.16

0.08

0.17

0.11

0.12

0.11

0.08

0.10

0.09

0.08

0.10

0.08

0.16

0.12

0.12

0.10

0.12

0.11

0.13

0.14

0.16

0.15

0.15

0.09

0.11

0.12

0.14

0.09

ITP

201

224

0.06

0.17

0.06

0.15

0.02

0.02

0.04

0.07

0.06

0.11

0.03

0.12

0.04

0.08

0.05

0.05

0.06

0.04

0.02

0.04

0.06

0.12

0.08

0.08

0.05

0.08

0.08

0.08

0.10

0.12

0.11

0.11

0.04

0.07

0.07

0.09

0.04

TIR

196

27

251

0.22

0.00

0.21

0.06

0.07

0.06

0.11

0.10

0.15

0.06

0.17

0.10

0.12

0.10

0.06

0.08

0.07

0.07

0.08

0.06

0.15

0.10

0.10

0.10

0.11

0.10

0.11

0.13

0.15

0.14

0.14

0.07

0.09

0.10

0.13

0.07

SER

322

442

520

439

0.18

0.28

0.16

0.21

0.15

0.22

0.19

0.23

0.15

0.25

0.19

0.20

0.22

0.19

0.20

0.16

0.16

0.17

0.19

0.25

0.20

0.22

0.21

0.20

0.20

0.22

0.23

0.25

0.23

0.23

0.17

0.19

0.20

0.24

0.17

ALV

260

93

235

99

534

0.18

0.06

0.07

0.06

0.10

0.09

0.14

0.06

0.16

0.09

0.10

0.08

0.06

0.08

0.07

0.06

0.07

0.05

0.14

0.10

0.10

0.08

0.10

0.10

0.11

0.12

0.14

0.13

0.13

0.07

0.09

0.10

0.11

0.07

CES

514

673

506

696

612

724

0.17

0.19

0.14

0.17

0.19

0.14

0.11

0.16

0.16

0.17

0.18

0.18

0.19

0.15

0.15

0.15

0.19

0.23

0.19

0.21

0.14

0.18

0.18

0.15

0.21

0.22

0.21

0.22

0.16

0.16

0.13

0.22

0.13

ITR

477

585

371

612

697

605

252

0.01

0.04

0.07

0.05

0.12

0.04

0.14

0.05

0.08

0.05

0.05

0.04

0.04

0.02

0.04

0.06

0.12

0.07

0.07

0.06

0.08

0.07

0.09

0.10

0.12

0.10

0.10

0.03

0.07

0.08

0.07

0.03

ROM

477

585

371

612

697

605

252

0

0.04

0.08

0.07

0.14

0.03

0.15

0.06

0.09

0.06

0.05

0.06

0.04

0.02

0.04

0.06

0.13

0.08

0.08

0.07

0.09

0.08

0.09

0.11

0.13

0.11

0.12

0.04

0.08

0.08

0.09

0.05

SLO

509

689

585

705

476

761

227

455

455

0.06

0.07

0.08

0.02

0.10

0.05

0.06

0.05

0.05

0.06

0.04

0.04

0.04

0.06

0.11

0.07

0.08

0.04

0.07

0.07

0.06

0.09

0.11

0.10

0.10

0.04

0.05

0.05

0.09

0.04

ITB

591

752

583

775

672

802

79

302

302

245

0.10

0.11

0.05

0.12

0.08

0.10

0.08

0.10

0.11

0.06

0.06

0.07

0.10

0.16

0.11

0.13

0.07

0.11

0.11

0.09

0.13

0.16

0.14

0.15

0.08

0.09

0.08

0.13

0.05

BUL

428

414

604

393

316

483

881

899

899

782

950

0.14

0.06

0.15

0.08

0.10

0.09

0.09

0.08

0.07

0.06

0.08

0.09

0.15

0.10

0.11

0.08

0.11

0.10

0.10

0.13

0.15

0.13

0.14

0.07

0.09

0.10

0.12

0.07

DES

691

859

702

881

725

915

198

427

427

255

126

1021

0.07

0.09

0.09

0.12

0.13

0.14

0.14

0.11

0.11

0.11

0.14

0.19

0.15

0.17

0.09

0.14

0.14

0.10

0.17

0.18

0.17

0.18

0.12

0.11

0.08

0.17

0.08

TRE

684

858

717

877

683

920

230

476

476

206

176

986

80

0.09

0.04

0.05

0.04

0.06

0.06

0.03

0.03

0.03

0.06

0.12

0.07

0.08

0.03

0.07

0.07

0.05

0.10

0.11

0.10

0.11

0.05

0.05

0.04

0.08

0.03

BRE

719

887

728

908

752

942

223

445

445

280

148

1048

28

92

0.12

0.14

0.13

0.15

0.16

0.12

0.12

0.12

0.15

0.20

0.16

0.18

0.11

0.15

0.15

0.12

0.18

0.19

0.18

0.19

0.13

0.13

0.10

0.18

0.11

SIC

537

480

340

502

858

420

715

483

483

865

779

894

905

945

925

0.09

0.07

0.08

0.08

0.06

0.05

0.06

0.09

0.14

0.10

0.10

0.06

0.10

0.09

0.09

0.12

0.14

0.12

0.13

0.06

0.08

0.08

0.11

0.05

ROS

683

771

884

761

367

859

917

1047

1047

726

961

445

977

915

1000

1219

0.09

0.10

0.10

0.07

0.07

0.08

0.10

0.16

0.11

0.13

0.09

0.11

0.11

0.10

0.13

0.15

0.14

0.14

0.08

0.09

0.09

0.13

0.07

IMP

840

984

787

1009

949

1020

341

428

428

507

277

1221

268

346

254

891

1232

0.08

0.08

0.05

0.04

0.05

0.09

0.15

0.09

0.11

0.07

0.09

0.09

0.08

0.12

0.15

0.13

0.13

0.06

0.08

0.07

0.10

0.05

GRA

711

534

705

515

884

476

1200

1070

1070

1220

1279

656

1390

1391

1417

736

1100

1492

0.07

0.07

0.05

0.07

0.02

0.13

0.08

0.09

0.08

0.09

0.08

0.10

0.11

0.13

0.11

0.12

0.06

0.08

0.09

0.10

0.07

TUA

851

694

901

669

932

667

1363

1271

1271

1340

1441

641

1542

1529

1569

994

1057

1678

277

0.07

0.05

0.07

0.08

0.14

0.09

0.06

0.08

0.10

0.09

0.10

0.12

0.14

0.12

0.13

0.05

0.09

0.10

0.03

0.06

FRS

1038

1192

1004

1216

1100

1234

525

654

654

629

448

1393

375

428

349

1119

1341

228

1709

1885

0.04

0.03

0.07

0.12

0.08

0.09

0.05

0.08

0.08

0.07

0.10

0.12

0.11

0.11

0.05

0.06

0.06

0.10

0.04

MAL

817

746

621

764

1139

674

943

694

694

1118

996

1157

1119

1170

1135

280

1498

1049

873

1148

1270

0.04

0.06

0.12

0.07

0.08

0.05

0.08

0.07

0.07

0.10

0.12

0.10

0.11

0.04

0.06

0.06

0.07

0.03

STS

1053

1230

1098

1248

989

1297

604

835

835

546

536

1304

410

381

390

1314

1125

528

1764

1886

412

1523

0.07

0.13

0.09

0.09

0.05

0.08

0.08

0.07

0.11

0.13

0.11

0.12

0.05

0.07

0.06

0.10

0.04

GRC

934

754

884

742

1141

679

1389

1224

1224

1440

1467

925

1585

1596

1611

812

1370

1653

270

445

1877

862

1975

0.13

0.09

0.10

0.08

0.10

0.09

0.10

0.12

0.14

0.12

0.12

0.07

0.08

0.09

0.11

0.07

SEV

1250

1196

1410

1170

1082

1235

1689

1723

1723

1551

1753

824

1804

1756

1830

1652

894

2029

1086

843

2180

1878

2012

1288

0.10

0.16

0.14

0.07

0.03

0.15

0.05

0.08

0.05

0.05

0.12

0.07

0.14

0.17

0.12

ALU

1321

1268

1482

1242

1149

1307

1757

1794

1794

1617

1821

895

1870

1821

1896

1724

950

2097

1151

904

2246

1948

2072

1348

72

0.11

0.09

0.06

0.05

0.11

0.07

0.09

0.09

0.08

0.08

0.06

0.10

0.13

0.08

BAR

1333

1446

1227

1473

1489

1458

878

862

862

1059

824

1742

815

887

796

1195

1783

552

1900

2123

512

1233

916

2007

2561

2631

0.10

0.12

0.11

0.13

0.14

0.16

0.14

0.14

0.04

0.11

0.12

0.11

0.09

SPM

1360

1450

1227

1478

1555

1448

956

883

883

1160

916

1782

933

1011

919

1136

1873

665

1864

2103

684

1128

1096

1944

2606

2677

211

0.09

0.08

0.02

0.11

0.13

0.12

0.12

0.06

0.06

0.01

0.06

0.04

KER

1490

1443

1655

1417

1301

1484

1913

1960

1960

1762

1973

1062

2017

1964

2042

1901

1073

2250

1327

1075

2391

2127

2197

1517

250

179

2789

2841

0.03

0.10

0.05

0.07

0.05

0.05

0.08

0.04

0.09

0.13

0.07

KRA

1687

1633

1848

1607

1506

1670

2117

2160

2160

1968

2178

1261

2222

2170

2248

2084

1276

2455

1478

1214

2596

2301

2401

1648

438

366

2992

3042

206

0.10

0.01

0.04

0.02

0.01

0.08

0.02

0.09

0.13

0.07

SPC

1601

1698

1474

1725

1781

1698

1173

1124

1124

1363

1125

2020

1122

1196

1104

1387

2086

857

2116

2354

818

1368

1214

2193

2843

2913

310

252

3075

3277

0.12

0.15

0.13

0.14

0.08

0.07

0.01

0.13

0.05

SOC

1748

1678

1898

1651

1591

1706

2197

2224

2224

2060

2262

1327

2313

2264

2339

2113

1388

2537

1470

1198

2689

2316

2512

1618

509

443

3068

3107

325

161

3346

0.06

0.01

0.02

0.10

0.04

0.12

0.15

0.10

TIK

1778

1733

1945

1707

1580

1774

2191

2247

2247

2033

2250

1350

2288

2233

2313

2190

1328

2527

1600

1339

2660

2414

2447

1776

539

467

3069

3126

290

132

3358

253

0.05

0.05

0.12

0.07

0.14

0.17

0.12

GES

1845

1770

1991

1743

1695

1795

2300

2321

2321

2166

2365

1426

2418

2370

2444

2197

1496

2640

1540

1265

2794

2395

2620

1675

614

550

3168

3204

434

257

3445

109

315

0.03

0.11

0.06

0.12

0.16

0.11

RAR

1853

1798

2014

1772

1670

1833

2281

2326

2326

2130

2342

1427

2385

2331

2410

2246

1432

2619

1629

1361

2758

2460

2555

1789

604

532

3157

3208

369

166

3442

187

127

210

0.11

0.05

0.13

0.16

0.11

TUH

1919

1828

2051

1801

1797

1844

2393

2395

2395

2272

2461

1512

2522

2478

2549

2233

1629

2734

1544

1267

2897

2412

2745

1650

734

679

3251

3275

597

443

3519

283

508

194

398

0.07

0.08

0.09

0.05

ARM

2235

2144

2367

2117

2107

2159

2706

2711

2711

2581

2773

1826

2832

2787

2859

2545

1923

3047

1848

1571

3207

2719

3045

1937

1034

974

3566

3591

862

674

3835

538

687

429

560

316

0.06

0.12

0.05

SPB

2158

2263

2040

2290

2313

2266

1702

1683

1683

1869

1644

2569

1617

1682

1595

1957

2594

1367

2687

2925

1264

1923

1601

2760

3388

3457

827

823

3614

3818

571

3895

3893

3995

3983

4076

4392

0.13

0.05

SPS

2144

2226

2002

2253

2341

2215

1737

1669

1669

1930

1690

2568

1689

1761

1670

1865

2651

1424

2601

2855

1371

1785

1750

2643

3393

3464

874

787

3628

3829

567

3892

3913

3988

3995

4054

4370

332

0.09

POP

2197

2320

2103

2347

2316

2335

1711

1734

1734

1852

1645

2591

1597

1653

1572

2065

2566

1370

2777

3001

1225

2067

1509

2876

3397

3464

878

939

3614

3819

702

3906

3885

4010

3981

4104

4417

283

615

# Create a new Word document
doc <- read_docx()

# Add the flextable to the Word document
doc <- body_add_flextable(doc, ft)

# Save the Word document
print(doc, target =  here("output", "europe", "fst", "r_01", "Europe_r01_LD2_flextable.docx"))

4. Mantel test with Europe (SNP Set 1)

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01 \
--keep-fam output/fst/pops_4fst.txt \
--make-bed \
--out output/fst/mantel01 \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel01.log

17028 variants loaded from .bim file. –keep-fam: 407 people remaining. Total genotyping rate in remaining samples is 0.970158. 17028 variants and 407 people pass filters and QC.

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/mantel01 \
--recodeA \
--out output/fst/mantel01 \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel01.log

17028 variants loaded from .bim file. 17028 variants and 407 people pass filters and QC.

Import the data and covert it to genind format

# import the data
albo <-
  read.PLINK(
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/mantel01.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)

Save

saveRDS(albo2, here(
  "output", "europe", "fst", "r_01", "europe_r01_LD2.rds"
))

Load it

albo2 <- readRDS(here(
  "output", "europe", "fst", "r_01", "europe_r01_LD2.rds"
))

Import Sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
head(sampling_loc)
##       Pop_City    Country  Latitude Longitude Continent Abbreviation
## 1  Franceville      Gabon  -1.59207  13.53242    Africa          GAB
## 2 Antananarivo Madagascar -18.87920  47.50790    Africa          ANT
## 3  Diego ville Madagascar -12.27361  49.29372    Africa          DGV
## 4    Morondava Madagascar -20.28420  44.27940    Africa          MAD
## 5     Vohimasy Madagascar -22.81591  47.75026    Africa          VOH
## 6      Dauguet  Mauritius -20.18530  57.52154    Africa          DAU
##          Year         Region    Subregion order
## 1        2015 Central Africa       Africa    72
## 2        2022    East Africa  East Africa    76
## 3        2022    East Africa  East Africa    77
## 4        2016    East Africa  East Africa    78
## 5 2016 & 2017    East Africa  East Africa    79
## 6        2022   Indian Ocean Indian Ocean    80

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/mantel01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# # Replace the BEN a with BEN b (remember to never name samples with the same ID... I change it manually in the fam file.)
# fam_data <- fam_data %>% 
#   mutate(IndividualID = ifelse(FamilyID == "BEN" & IndividualID == "a", "b", IndividualID))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1      SOC         1065          0          0   0        -9    Sochi  Russia
## 2      SOC         1066          0          0   0        -9    Sochi  Russia
## 3      SOC         1067          0          0   0        -9    Sochi  Russia
## 4      SOC         1068          0          0   0        -9    Sochi  Russia
## 5      SOC         1069          0          0   0        -9    Sochi  Russia
## 6      SOC         1070          0          0   0        -9    Sochi  Russia
##   Latitude Longitude Continent Year         Region   Subregion order
## 1 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 2 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 3 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 4 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 5 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38
## 6 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    38

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##          [,1]     [,2]
## [1,] 43.60042 39.74533
## [2,] 43.60042 39.74533
## [3,] 43.60042 39.74533
## [4,] 43.60042 39.74533
## [5,] 43.60042 39.74533
## [6,] 43.60042 39.74533
colnames(albo_dist2)<- c("x","y") 

Add jitter

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 43.59886 39.74551
## [2,] 43.59906 39.74661
## [3,] 43.60204 39.74453
## [4,] 43.60060 39.74656
## [5,] 43.60100 39.74589
## [6,] 43.59957 39.74666

Add to object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "output", "europe", "fst", "r_01", "europe_01_LD2.rds"
  )
)

4.1 Isolation By Distance for Europe (SNP Set 1)

# Calculate distances
# Genetic
Dgen <- dist(albo2$tab)

# Physical
Dgeo <- dist(other(albo2)$xy)

# Run the Mantel test is a statistical test to compare two matrices and check if they are correlated. In this context, it's used to check if genetic distance (Dgen) correlates with geographic distance (Dgeo). The .randtest variant performs a randomized version of the test to estimate the significance of the observed correlation.
ibd <- mantel.randtest(Dgen,Dgeo)

# Check it
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: 0.36077 
## 
## Based on 999 replicates
## Simulated p-value: 0.001 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
## 2.845410e+01 4.867888e-04 1.603240e-04

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "r_01", "simIBD_europe_r01_LD2.pdf"))
plot(ibd)
dev.off()
plot(ibd)

plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)

Add the equation

# Plotting the data
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
# You can adjust the position (e.g., x and y values) as necessary
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.7, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.65, labels = r2_label)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "r_01", "Genetic_v_Geog_distance_europe_r01_LD2.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)


dev.off()

Use library MASS for plot

CairoPDF(here(
     "output", "europe", "fst", "r_01", "IDB_PlotFromMASS_europe_r01_density.pdf"))
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
myPal <-
  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)


title("Isolation by distance")

dev.off()
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
myPal <-
  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.5, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.45, labels = r2_label)

title("Isolation by distance")

Check the populations - I did not include those with less than 4 mosquitoes

summary(albo2$pop)
## ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC IMP ITB ITP ITR KER KRA 
##  10  12  12  10  12  13  10  14  12  16  12  12  11  10   4   5   8  12  12  12 
## MAL POP RAR ROM ROS SER SEV SIC SLO SOC SPB SPC SPM SPS STS TIK TIR TRE TUA TUH 
##  12  12  12   4  11   4  12   9  12  12   8   6   5   8  12  12   4  12   9  12

Fst for MAF 1% filtered SNP set (Set 3)

1. Estimate Fst for European populations with at least 4 individuals using SNP Set 3 (r2<0.01, MAF <1% removed)

Create list of populations

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe

awk '{print $1}' output/snps_sets/r2_0.01_b.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 4 {print}' | awk '{print $1}' > output/fst/pops_4bfst.txt;
head  output/fst/pops_4bfst.txt;
wc -l output/fst/pops_4bfst.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## 40 output/fst/pops_4bfst.txt

We have 40 populations with 4 or more mosquitoes.

1.1 Convert to raw format and subset the bed file

First load plink

module load PLINK/1.9b_6.21-x86_64
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01_b \
--keep-fam output/fst/pops_4bfst.txt \
--recodeA \
--out output/fst/r2_0.01_b \
--silent;
grep 'samples\|variants\|remaining' output/fst/r2_0.01_b.log

20968 variants loaded from .bim file. –keep-fam: 408 people remaining. Total genotyping rate in remaining samples is 0.971028. 20968 variants and 408 people pass filters and QC.

Look at https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of Fst estimations

LD2 <-
  read.PLINK(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/r2_0.01_b.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )
## 
##  Reading PLINK raw format into a genlight object... 
## 
## 
##  Reading loci information... 
## 
##  Reading and converting genotypes... 
## .
##  Building final object... 
## 
## ...done.
summary(LD2)
##   Length    Class     Mode 
##        1 genlight       S4

1.2 Convert the genlight object to Stampp format, and estimate pairwide Fst values

The command below would also work, but you can simplify it and put only the numbers: genome_equal_2 <- stamppFst(neutral, nboots=100, percent=95 + nclusters==10)

This chunk will take a couple minutes to run.

# convert
LD2_2 <- stamppConvert(LD2, type="genlight")
# run stampp. If you want to run with bootstraps and nclusters use the HPC. It will run out of memory on a 32Gb laptop
LD2_3 <- stamppFst(LD2_2, 1, 95, 1)

Save it

saveRDS(
  LD2_3, here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/LD2_MAF1.rds"
  )
)

To load it

LD2_3 <- readRDS(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/LD2_MAF1.rds"
  )
)

Now lets look at the object

summary(LD2_3)
##       SOC                SEV               GES               KER         
##  Min.   :0.007368   Min.   :0.02984   Min.   :0.01731   Min.   :0.03114  
##  1st Qu.:0.095683   1st Qu.:0.11672   1st Qu.:0.10258   1st Qu.:0.07804  
##  Median :0.114950   Median :0.13445   Median :0.12082   Median :0.09649  
##  Mean   :0.108035   Mean   :0.13179   Mean   :0.11816   Mean   :0.09821  
##  3rd Qu.:0.128378   3rd Qu.:0.15304   3rd Qu.:0.13515   3rd Qu.:0.10979  
##  Max.   :0.226722   Max.   :0.24859   Max.   :0.23290   Max.   :0.20445  
##  NA's   :1          NA's   :2         NA's   :3         NA's   :4        
##       KRA               TIK               RAR               TRE         
##  Min.   :0.01126   Min.   :0.05318   Min.   :0.05336   Min.   :0.02292  
##  1st Qu.:0.07449   1st Qu.:0.12048   1st Qu.:0.11031   1st Qu.:0.03722  
##  Median :0.09153   Median :0.13660   Median :0.12626   Median :0.05341  
##  Mean   :0.09234   Mean   :0.13868   Mean   :0.13086   Mean   :0.05661  
##  3rd Qu.:0.10511   3rd Qu.:0.15285   3rd Qu.:0.14308   3rd Qu.:0.06398  
##  Max.   :0.19658   Max.   :0.25029   Max.   :0.23530   Max.   :0.15403  
##  NA's   :5         NA's   :6         NA's   :7         NA's   :8        
##       ALU               STS               SIC               BRE         
##  Min.   :0.06494   Min.   :0.03010   Min.   :0.04422   Min.   :0.08617  
##  1st Qu.:0.07964   1st Qu.:0.05092   1st Qu.:0.06364   1st Qu.:0.12019  
##  Median :0.09507   Median :0.06948   Median :0.08073   Median :0.13841  
##  Mean   :0.10215   Mean   :0.07361   Mean   :0.08448   Mean   :0.14183  
##  3rd Qu.:0.10886   3rd Qu.:0.07770   3rd Qu.:0.09281   3rd Qu.:0.15641  
##  Max.   :0.19797   Max.   :0.17530   Max.   :0.19317   Max.   :0.24879  
##  NA's   :9         NA's   :10        NA's   :11        NA's   :12       
##       DES               CES              TIR                 IMP         
##  Min.   :0.07761   Min.   :0.1316   Min.   :-0.000934   Min.   :0.04352  
##  1st Qu.:0.10740   1st Qu.:0.1527   1st Qu.: 0.062847   1st Qu.:0.05687  
##  Median :0.12503   Median :0.1755   Median : 0.073205   Median :0.08116  
##  Mean   :0.12773   Mean   :0.1763   Mean   : 0.083119   Mean   :0.08135  
##  3rd Qu.:0.14069   3rd Qu.:0.1882   3rd Qu.: 0.104343   3rd Qu.:0.08647  
##  Max.   :0.22839   Max.   :0.2844   Max.   : 0.222903   Max.   :0.22637  
##  NA's   :13        NA's   :14       NA's   :15          NA's   :16       
##       ROM                GRC               BAR               BUL         
##  Min.   :0.006012   Min.   :0.02418   Min.   :0.04376   Min.   :0.05357  
##  1st Qu.:0.044731   1st Qu.:0.06061   1st Qu.:0.08101   1st Qu.:0.06767  
##  Median :0.066024   Median :0.07681   Median :0.09852   Median :0.09179  
##  Mean   :0.067477   Mean   :0.08195   Mean   :0.10274   Mean   :0.09108  
##  3rd Qu.:0.082312   3rd Qu.:0.09720   3rd Qu.:0.12083   3rd Qu.:0.10032  
##  Max.   :0.207815   Max.   :0.18931   Max.   :0.21778   Max.   :0.19614  
##  NA's   :17         NA's   :18        NA's   :19        NA's   :20       
##       CRO               GRA               ITB               MAL         
##  Min.   :0.02773   Min.   :0.04970   Min.   :0.05193   Min.   :0.01802  
##  1st Qu.:0.05615   1st Qu.:0.05852   1st Qu.:0.06888   1st Qu.:0.03876  
##  Median :0.06531   Median :0.07608   Median :0.08580   Median :0.06135  
##  Mean   :0.07608   Mean   :0.08161   Mean   :0.09276   Mean   :0.05974  
##  3rd Qu.:0.09071   3rd Qu.:0.09589   3rd Qu.:0.10242   3rd Qu.:0.07366  
##  Max.   :0.17329   Max.   :0.18592   Max.   :0.21971   Max.   :0.16130  
##  NA's   :21        NA's   :22        NA's   :23        NA's   :24       
##       SPM                TUA               TUH               ALD         
##  Min.   :0.006262   Min.   :0.02844   Min.   :0.03486   Min.   :0.02204  
##  1st Qu.:0.056700   1st Qu.:0.05760   1st Qu.:0.04529   1st Qu.:0.08084  
##  Median :0.071302   Median :0.07320   Median :0.06970   Median :0.09978  
##  Mean   :0.077542   Mean   :0.08055   Mean   :0.07255   Mean   :0.10620  
##  3rd Qu.:0.093710   3rd Qu.:0.09974   3rd Qu.:0.08458   3rd Qu.:0.12480  
##  Max.   :0.213896   Max.   :0.19864   Max.   :0.17311   Max.   :0.21576  
##  NA's   :25         NA's   :26        NA's   :27        NA's   :28       
##       FRS               ITP               POP               ROS         
##  Min.   :0.03937   Min.   :0.02323   Min.   :0.03491   Min.   :0.05906  
##  1st Qu.:0.04234   1st Qu.:0.04179   1st Qu.:0.05167   1st Qu.:0.08919  
##  Median :0.06316   Median :0.06778   Median :0.05496   Median :0.09704  
##  Mean   :0.06904   Mean   :0.07107   Mean   :0.07007   Mean   :0.10790  
##  3rd Qu.:0.07371   3rd Qu.:0.07952   3rd Qu.:0.07260   3rd Qu.:0.10967  
##  Max.   :0.16404   Max.   :0.17461   Max.   :0.16508   Max.   :0.20308  
##  NA's   :29        NA's   :30        NA's   :31        NA's   :32       
##       SER              SLO               SPC               SPB         
##  Min.   :0.1529   Min.   :0.04127   Min.   :0.00711   Min.   :0.06086  
##  1st Qu.:0.1745   1st Qu.:0.05380   1st Qu.:0.07035   1st Qu.:0.07552  
##  Median :0.1902   Median :0.05741   Median :0.08783   Median :0.08804  
##  Mean   :0.1930   Mean   :0.06074   Mean   :0.08176   Mean   :0.09127  
##  3rd Qu.:0.2100   3rd Qu.:0.06194   3rd Qu.:0.10685   3rd Qu.:0.10379  
##  Max.   :0.2386   Max.   :0.09230   Max.   :0.13664   Max.   :0.12813  
##  NA's   :33       NA's   :34        NA's   :35        NA's   :36       
##       SPS               ARM               ALV               ITR     
##  Min.   :0.07152   Min.   :0.06985   Min.   :0.05703   Min.   : NA  
##  1st Qu.:0.09248   1st Qu.:0.07432   1st Qu.:0.05703   1st Qu.: NA  
##  Median :0.11345   Median :0.07880   Median :0.05703   Median : NA  
##  Mean   :0.10059   Mean   :0.07880   Mean   :0.05703   Mean   :NaN  
##  3rd Qu.:0.11513   3rd Qu.:0.08328   3rd Qu.:0.05703   3rd Qu.: NA  
##  Max.   :0.11682   Max.   :0.08776   Max.   :0.05703   Max.   : NA  
##  NA's   :37        NA's   :38        NA's   :39        NA's   :40

If you want you can save the fst values as csv.

# Convert to data frame
LD2_df <- data.frame(LD2_3)
# Save it
write.csv(LD2_df, file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/LD2_df_MAF1.csv"))

Check the Fst values

head(LD2_df)
##             SOC        SEV        GES        KER       KRA TIK RAR TRE ALU STS
## SOC          NA         NA         NA         NA        NA  NA  NA  NA  NA  NA
## SEV 0.044764659         NA         NA         NA        NA  NA  NA  NA  NA  NA
## GES 0.011269778 0.05366015         NA         NA        NA  NA  NA  NA  NA  NA
## KER 0.046807291 0.06763399 0.05416414         NA        NA  NA  NA  NA  NA  NA
## KRA 0.007367688 0.02983546 0.01730859 0.03114523        NA  NA  NA  NA  NA  NA
## TIK 0.057321524 0.07850480 0.05231734 0.07125463 0.0352695  NA  NA  NA  NA  NA
##     SIC BRE DES CES TIR IMP ROM GRC BAR BUL CRO GRA ITB MAL SPM TUA TUH ALD FRS
## SOC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## SEV  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GES  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KER  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KRA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TIK  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
##     ITP POP ROS SER SLO SPC SPB SPS ARM ALV ITR
## SOC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## SEV  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GES  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KER  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## KRA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## TIK  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

Convert the data into a matrix.

aa <- as.matrix(LD2_df)
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##             SOC        SEV        GES        KER         KRA        TIK
## SOC          NA 0.04476466 0.01126978 0.04680729 0.007367688 0.05732152
## SEV 0.044764659         NA 0.05366015 0.06763399 0.029835464 0.07850480
## GES 0.011269778 0.05366015         NA 0.05416414 0.017308587 0.05231734
## KER 0.046807291 0.06763399 0.05416414         NA 0.031145234 0.07125463
## KRA 0.007367688 0.02983546 0.01730859 0.03114523          NA 0.03526950
## TIK 0.057321524 0.07850480 0.05231734 0.07125463 0.035269501         NA
##            RAR        TRE        ALU        STS        SIC       BRE       DES
## SOC 0.02162356 0.09677667 0.07010602 0.10591967 0.11784326 0.1783248 0.1653535
## SEV 0.05301801 0.11534477 0.09442672 0.12611348 0.14011854 0.1957788 0.1837507
## GES 0.02837824 0.10102216 0.08513146 0.10927914 0.12290051 0.1813135 0.1691206
## KER 0.05279426 0.07229226 0.06560035 0.08193350 0.09777071 0.1534374 0.1390736
## KRA 0.01126164 0.07169993 0.05454288 0.08239319 0.09220277 0.1519838 0.1397541
## TIK 0.05317902 0.11330313 0.09008958 0.12334852 0.13737947 0.1945549 0.1803734
##           CES       TIR        IMP        ROM        GRC       BAR       BUL
## SOC 0.2093281 0.1284323 0.12099490 0.10667493 0.11671692 0.1360204 0.1283236
## SEV 0.2267463 0.1529011 0.14651785 0.13206617 0.13376653 0.1575244 0.1491057
## GES 0.2119834 0.1351532 0.12748623 0.11182282 0.12208159 0.1427050 0.1338410
## KER 0.1814383 0.1104057 0.09219116 0.08795374 0.10017423 0.1237427 0.1095903
## KRA 0.1823634 0.1017700 0.09264869 0.08159695 0.09017657 0.1108940 0.1032033
## TIK 0.2237019 0.1538818 0.14536390 0.13186905 0.13432670 0.1539908 0.1478755
##           CRO        GRA       ITB        MAL        SPM        TUA        TUH
## SOC 0.1149502 0.10889177 0.1337168 0.09458982 0.12205750 0.11826743 0.10435177
## SEV 0.1351410 0.12765441 0.1570420 0.11656423 0.14762503 0.13848504 0.12249588
## GES 0.1203672 0.11376844 0.1389219 0.10125102 0.12971049 0.12401729 0.11137798
## KER 0.0996719 0.09520248 0.1085958 0.07559881 0.09938746 0.10311247 0.08539267
## KRA 0.0918335 0.08437280 0.1070244 0.07285346 0.09393310 0.09124916 0.07816345
## TIK 0.1325973 0.12705980 0.1559944 0.11459438 0.14348939 0.13653641 0.12161918
##           ALD        FRS        ITP        POP       ROS       SER        SLO
## SOC 0.1393253 0.10279814 0.10070244 0.09884633 0.1342376 0.2267217 0.09140753
## SEV 0.1592704 0.12144167 0.12209602 0.11938424 0.1537391 0.2485867 0.10967112
## GES 0.1445257 0.10779885 0.10598420 0.10461590 0.1391201 0.2329000 0.09665973
## KER 0.1249993 0.07884778 0.08100376 0.07324363 0.1106383 0.2044508 0.07291814
## KRA 0.1135568 0.07658576 0.07497710 0.07400209 0.1075813 0.1965758 0.06479207
## TIK 0.1586231 0.12035512 0.12086377 0.11777636 0.1497754 0.2502909 0.10749679
##            SPC        SPB       SPS        ARM        ALV        ITR
## SOC 0.12576249 0.11561471 0.1508072 0.04507389 0.11809185 0.09717221
## SEV 0.15308239 0.14034768 0.1720481 0.07105325 0.13842989 0.11718572
## GES 0.13312607 0.12082033 0.1581405 0.05626819 0.12396762 0.10257979
## KER 0.10263316 0.09346009 0.1332609 0.04022032 0.10266884 0.07947360
## KRA 0.09933254 0.09153356 0.1250888 0.01891726 0.09497027 0.07282745
## TIK 0.14746567 0.13712217 0.1701660 0.06736327 0.13665857 0.11595669

Import sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))


# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    order
  )

# Check it
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##   [1] "BER" "COL" "PAL" "HOU" "LOS" "MAU" "REC" "GRV" "FRS" "STS" "POP" "POL"
##  [13] "SPB" "SPS" "SPC" "BAR" "SPM" "IMP" "ITG" "BRE" "DES" "TRE" "ITB" "CES"
##  [25] "ROM" "ITR" "SIC" "ITP" "MAL" "SLO" "CRO" "ALV" "ALD" "TIR" "SER" "GRA"
##  [37] "GRC" "ROS" "BUL" "TUA" "TUH" "SEV" "ALU" "KER" "KRA" "SOC" "TIK" "RAR"
##  [49] "GES" "ARM" "KAN" "UTS" "KAG" "OKI" "HAI" "YUN" "HUN" "TAI" "GEL" "BEN"
##  [61] "KUN" "KAT" "JAF" "CAM" "SUF" "SUU" "INW" "INJ" "KLP" "MAT" "SSK" "KAC"
##  [73] "SON" "CHA" "LAM" "HAN" "HOC" "QNC" "LIB" "MAD" "TRO" "DAU" "JAT" "YAT"
##  [85] "GAB" "ANT" "DGV" "VOH" "RAB" "YAO" "AWK" "BRM" "JAM" "SAI" "BEA" "CHI"
##  [97] "DAL" "FAY" "MAC" "MAN" "NEO" "NEW" "NUE" "PEO" "RUS" "SPR" "POR" "NOV"
## [109] "TUC" "MED" "AIZ" "HIR" "KHO" "KYO" "NAG" "NIG" "SAG" "SAK" "SEN" "TAN"
LD2_df_ordered <- read_csv(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/LD2_df_odered.csv"
)
## New names:
## Rows: 40 Columns: 41
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (40): FRS, STS, POP, SPB, SPS, SPC, BAR, SPM, IMP, BRE, DES, TRE,
## ITB, C...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Below doesnt change order Create vector with order of populations

# Extract the populations that appear in LD2_df
populations_in_LD2 <- colnames(LD2_df_ordered)

# Reorder the populations based on order_pops
poporder <- populations_in_LD2[populations_in_LD2 %in% order_pops]

#LD2_df[match(poporder, LD2_df$Abbreviation),] #this also doesn't reorder it

# Print the reordered populations
print(poporder)
##  [1] "FRS" "STS" "POP" "SPB" "SPS" "SPC" "BAR" "SPM" "IMP" "BRE" "DES" "TRE"
## [13] "ITB" "CES" "ROM" "ITR" "SIC" "ITP" "MAL" "SLO" "CRO" "ALV" "ALD" "TIR"
## [25] "SER" "GRA" "GRC" "ROS" "BUL" "TUA" "TUH" "SEV" "ALU" "KER" "KRA" "SOC"
## [37] "TIK" "RAR" "GES" "ARM"

Lets check if the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Order the matrix using poporder. We will also add NA on the upper left side of the matrix.

aa <- aa[poporder, poporder]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1           Var2          value        
##  FRS    :  40   FRS    :  40   Min.   :-0.0009  
##  STS    :  40   STS    :  40   1st Qu.: 0.0668  
##  POP    :  40   POP    :  40   Median : 0.0946  
##  SPB    :  40   SPB    :  40   Mean   : 0.1004  
##  SPS    :  40   SPS    :  40   3rd Qu.: 0.1261  
##  SPC    :  40   SPC    :  40   Max.   : 0.2844  
##  (Other):1360   (Other):1360   NA's   :820

Now lets plot the data with ggplot. You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 2) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

Save it

ggsave(
  filename = here("output", "europe", "fst", "fst_matrix_europe_LD2_ordered_MAF1.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)

1.3 Fst by country for Europe SNP Set 3 dataset

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(LD2_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)
#save the fst values as csv
write.csv(fst_summary, file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/fst_summary_MAF1.csv"))
fst_summary_ordered <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/fst_summary_ordered_MAF1.txt")
#fst_summary_ordered <- data.frame(fst_summary_ordered)
fst_summary_ordered <- read.table(fst_summary_ordered,
                                  header=TRUE)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary_ordered, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}

# Check if the matrix is symmetric
# print(isSymmetric(symmetric_fst_matrix))

# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##             Albania    Armenia   Bulgaria    Croatia     France    Georgia
## Albania          NA 0.09394606 0.10094454 0.02819142 0.08151350 0.13454883
## Armenia  0.09394606         NA 0.09585657 0.08475669 0.06544647 0.05626819
## Bulgaria 0.10094454 0.09585657         NA 0.08722725 0.07492144 0.13384104
## Croatia  0.02819142 0.08475669 0.08722725         NA 0.06302575 0.12036716
## France   0.08151350 0.06544647 0.07492144 0.06302575         NA 0.10853899
## Georgia  0.13454883 0.05626819 0.13384104 0.12036716 0.10853899         NA
## Greece   0.06201272 0.08205863 0.09300545 0.05544159 0.07064597 0.11792501
## Italy    0.09997730 0.08203692 0.09055454 0.08407101 0.07093337 0.13731351
## Malta    0.06886598 0.06332318 0.05679897 0.05500955 0.03897441 0.10125102
## Portugal 0.07546730 0.05495953 0.06636208 0.06402375 0.04227483 0.10461590
## Romania  0.11103043 0.09451032 0.10293539 0.09925514 0.07490661 0.13912009
## Russia   0.12964785 0.04617807 0.12924704 0.11541609 0.10455781 0.02196892
## Serbia   0.20173826 0.19022765 0.19614135 0.17328819 0.16966823 0.23289996
## Slovenia 0.06612389 0.05361430 0.06530144 0.05729126 0.04390333 0.09665973
## Spain    0.11175243 0.08458340 0.10696192 0.09537204 0.07523979 0.13690047
## Turkey   0.07899673 0.07867139 0.07568412 0.07012296 0.05986011 0.11769763
## Ukraine  0.12249231 0.05873850 0.12119872 0.10928968 0.09552548 0.06165397
##              Greece      Italy      Malta   Portugal    Romania     Russia
## Albania  0.06201272 0.09997730 0.06886598 0.07546730 0.11103043 0.12964785
## Armenia  0.08205863 0.08203692 0.06332318 0.05495953 0.09451032 0.04617807
## Bulgaria 0.09300545 0.09055454 0.05679897 0.06636208 0.10293539 0.12924704
## Croatia  0.05544159 0.08407101 0.05500955 0.06402375 0.09925514 0.11541609
## France   0.07064597 0.07093337 0.03897441 0.04227483 0.07490661 0.10455781
## Georgia  0.11792501 0.13731351 0.10125102 0.10461590 0.13912009 0.02196892
## Greece           NA 0.08917035 0.05472746 0.06798589 0.10150763 0.11254358
## Italy    0.08917035         NA 0.04641674 0.04923588 0.09169809 0.13335761
## Malta    0.05472746 0.04641674         NA 0.02975032 0.07357926 0.09679329
## Portugal 0.06798589 0.04923588 0.02975032         NA 0.07259454 0.10008283
## Romania  0.10150763 0.09169809 0.07357926 0.07259454         NA 0.13366776
## Russia   0.11254358 0.13335761 0.09679329 0.10008283 0.13366776         NA
## Serbia   0.18761510 0.19008007 0.16129723 0.16508121 0.20308218 0.22722156
## Slovenia 0.05726520 0.05241481 0.03916767 0.03586147 0.05906033 0.09065635
## Spain    0.09579872 0.09298149 0.07372547 0.06801861 0.10896586 0.13140191
## Turkey   0.07154177 0.06994499 0.04497299 0.05514368 0.09174178 0.11102150
## Ukraine  0.10496079 0.09975371 0.08753557 0.09026470 0.12570020 0.05816269
##             Serbia   Slovenia      Spain     Turkey    Ukraine
## Albania  0.2017383 0.06612389 0.11175243 0.07899673 0.12249231
## Armenia  0.1902276 0.05361430 0.08458340 0.07867139 0.05873850
## Bulgaria 0.1961414 0.06530144 0.10696192 0.07568412 0.12119872
## Croatia  0.1732882 0.05729126 0.09537204 0.07012296 0.10928968
## France   0.1696682 0.04390333 0.07523979 0.05986011 0.09552548
## Georgia  0.2329000 0.09665973 0.13690047 0.11769763 0.06165397
## Greece   0.1876151 0.05726520 0.09579872 0.07154177 0.10496079
## Italy    0.1900801 0.05241481 0.09298149 0.06994499 0.09975371
## Malta    0.1612972 0.03916767 0.07372547 0.04497299 0.08753557
## Portugal 0.1650812 0.03586147 0.06801861 0.05514368 0.09026470
## Romania  0.2030822 0.05906033 0.10896586 0.09174178 0.12570020
## Russia   0.2272216 0.09065635 0.13140191 0.11102150 0.05816269
## Serbia          NA 0.15287729 0.21769108 0.18587606 0.21700387
## Slovenia 0.1528773         NA 0.06748962 0.05168875 0.08308108
## Spain    0.2176911 0.06748962         NA 0.07429569 0.12408194
## Turkey   0.1858761 0.05168875 0.07429569         NA 0.10348110
## Ukraine  0.2170039 0.08308108 0.12408194 0.10348110         NA

1.3.1 Reorder

# Read the file
country_order <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/country_order.txt")
country_order <- read.table(country_order, 
                       header = FALSE,
                       col.names = c("country"))

order_countries <- as.vector(country_order$country)
order_countries
##  [1] "France"   "Portugal" "Spain"    "Italy"    "Malta"    "Slovenia"
##  [7] "Croatia"  "Albania"  "Serbia"   "Greece"   "Romania"  "Bulgaria"
## [13] "Turkey"   "Ukraine"  "Russia"   "Georgia"  "Armenia"
write.csv(symmetric_fst_matrix, file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/symmetric_fst_matrix_MAF1.csv"))
symmetric_fst_matrix_ordered <- read_csv(
 "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/symmetric_fst_matrix_ordered_MAF1.csv"
)
## New names:
## Rows: 17 Columns: 18
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (17): France, Portugal, Spain, Italy, Malta, Slovenia, Croatia,
## Albania,...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
symmetric_fst_matrix_ordered<- as.data.frame(symmetric_fst_matrix_ordered) #convert to dataframe
rownames(symmetric_fst_matrix_ordered) <- symmetric_fst_matrix_ordered$...1 #use first column as rownames
symmetric_fst_matrix_ordered <- symmetric_fst_matrix_ordered[ -c(1) ] #remove 1st column
symmetric_fst_matrix_ordered <-as.matrix(symmetric_fst_matrix_ordered) #convert to matrix
symmetric_fst_matrix_ordered[lower.tri(symmetric_fst_matrix_ordered)] <- NA
print(symmetric_fst_matrix_ordered)
##          France   Portugal      Spain      Italy      Malta   Slovenia
## France       NA 0.04227483 0.07523979 0.07093337 0.03897441 0.04390333
## Portugal     NA         NA 0.06801861 0.04923588 0.02975032 0.03586147
## Spain        NA         NA         NA 0.09298149 0.07372547 0.06748962
## Italy        NA         NA         NA         NA 0.04641674 0.05241481
## Malta        NA         NA         NA         NA         NA 0.03916767
## Slovenia     NA         NA         NA         NA         NA         NA
## Croatia      NA         NA         NA         NA         NA         NA
## Albania      NA         NA         NA         NA         NA         NA
## Serbia       NA         NA         NA         NA         NA         NA
## Greece       NA         NA         NA         NA         NA         NA
## Romania      NA         NA         NA         NA         NA         NA
## Bulgaria     NA         NA         NA         NA         NA         NA
## Turkey       NA         NA         NA         NA         NA         NA
## Ukraine      NA         NA         NA         NA         NA         NA
## Russia       NA         NA         NA         NA         NA         NA
## Georgia      NA         NA         NA         NA         NA         NA
## Armenia      NA         NA         NA         NA         NA         NA
##             Croatia    Albania    Serbia     Greece    Romania   Bulgaria
## France   0.06302575 0.08151350 0.1696682 0.07064597 0.07490661 0.07492144
## Portugal 0.06402375 0.07546730 0.1650812 0.06798589 0.07259454 0.06636208
## Spain    0.09537205 0.11175243 0.2176911 0.09579872 0.10896586 0.10696192
## Italy    0.08407101 0.09997730 0.1900801 0.08917035 0.09169809 0.09055454
## Malta    0.05500955 0.06886598 0.1612972 0.05472746 0.07357926 0.05679897
## Slovenia 0.05729126 0.06612389 0.1528773 0.05726520 0.05906033 0.06530144
## Croatia          NA 0.02819142 0.1732882 0.05544159 0.09925514 0.08722725
## Albania          NA         NA 0.2017383 0.06201272 0.11103043 0.10094454
## Serbia           NA         NA        NA 0.18761510 0.20308218 0.19614135
## Greece           NA         NA        NA         NA 0.10150763 0.09300545
## Romania          NA         NA        NA         NA         NA 0.10293539
## Bulgaria         NA         NA        NA         NA         NA         NA
## Turkey           NA         NA        NA         NA         NA         NA
## Ukraine          NA         NA        NA         NA         NA         NA
## Russia           NA         NA        NA         NA         NA         NA
## Georgia          NA         NA        NA         NA         NA         NA
## Armenia          NA         NA        NA         NA         NA         NA
##              Turkey    Ukraine     Russia    Georgia    Armenia
## France   0.05986011 0.09552548 0.10455781 0.10853899 0.06544647
## Portugal 0.05514368 0.09026470 0.10008283 0.10461590 0.05495953
## Spain    0.07429569 0.12408194 0.13140191 0.13690047 0.08458340
## Italy    0.06994499 0.09975371 0.13335761 0.13731351 0.08203692
## Malta    0.04497299 0.08753557 0.09679329 0.10125102 0.06332318
## Slovenia 0.05168875 0.08308108 0.09065635 0.09665973 0.05361430
## Croatia  0.07012296 0.10928968 0.11541609 0.12036716 0.08475669
## Albania  0.07899673 0.12249231 0.12964785 0.13454883 0.09394606
## Serbia   0.18587606 0.21700387 0.22722156 0.23289996 0.19022765
## Greece   0.07154177 0.10496079 0.11254358 0.11792501 0.08205863
## Romania  0.09174178 0.12570020 0.13366776 0.13912009 0.09451032
## Bulgaria 0.07568412 0.12119872 0.12924704 0.13384104 0.09585657
## Turkey           NA 0.10348110 0.11102150 0.11769763 0.07867139
## Ukraine          NA         NA 0.05816269 0.06165397 0.05873850
## Russia           NA         NA         NA 0.02196892 0.04617807
## Georgia          NA         NA         NA         NA 0.05626819
## Armenia          NA         NA         NA         NA         NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix_ordered)
summary(pairfst.long2)
##        Var1           Var2         value        
##  France  : 17   France  : 17   Min.   :0.02197  
##  Portugal: 17   Portugal: 17   1st Qu.:0.06541  
##  Spain   : 17   Spain   : 17   Median :0.09041  
##  Italy   : 17   Italy   : 17   Mean   :0.09651  
##  Malta   : 17   Malta   : 17   3rd Qu.:0.11326  
##  Slovenia: 17   Slovenia: 17   Max.   :0.23290  
##  (Other) :187   (Other) :187   NA's   :153

You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("output", "europe", "fst", "fst_matrix_Europe_by_country_ordered_MAF1.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 40 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 FRS     0.0754
##  2 STS     0.0787
##  3 POP     0.0711
##  4 SPB     0.0903
##  5 SPS     0.122 
##  6 SPC     0.100 
##  7 BAR     0.114 
##  8 SPM     0.0916
##  9 IMP     0.0923
## 10 BRE     0.146 
## # ℹ 30 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Subregion
  )

# check output
head(fst3)
##               Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1 Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2           Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3             Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4              Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5            San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6            Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
##            Region order order2 orderold   mean_fst
## 1  Western Europe     9      1        1 0.07544418
## 2  Western Europe    10      2        2 0.07874215
## 3 Southern Europe    11      3        3 0.07110287
## 4 Southern Europe    13      5        5 0.09032277
## 5 Southern Europe    14      6        6 0.12206649
## 6 Southern Europe    15      7        7 0.10043809

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
##                Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##             Region order order2 orderold   mean_fst mean_fst_by_region
## 1   Western Europe     9      1        1 0.07544418               0.08
## 2   Western Europe    10      2        2 0.07874215               0.08
## 3  Southern Europe    11      3        3 0.07110287               0.10
## 4  Southern Europe    13      5        5 0.09032277               0.10
## 5  Southern Europe    14      6        6 0.12206649               0.10
## 6  Southern Europe    15      7        7 0.10043809               0.10
## 7  Southern Europe    16      8        8 0.11431214               0.10
## 8  Southern Europe    17      9        9 0.09158751               0.10
## 9  Southern Europe    18     10       10 0.09227643               0.10
## 10 Southern Europe    20     12       12 0.14626402               0.10
## 11 Southern Europe    21     13       13 0.13081478               0.10
## 12 Southern Europe    22     14       14 0.06377881               0.10
## 13 Southern Europe    23     15       15 0.10269616               0.10
## 14 Southern Europe    24     16       16 0.17817200               0.10
## 15 Southern Europe    25     17       17 0.08057195               0.10
## 16 Southern Europe    26     18       18 0.07113428               0.10
## 17 Southern Europe    27     19       19 0.08929935               0.10
## 18 Southern Europe    28     20       20 0.07221817               0.10
## 19 Southern Europe    29     21       21 0.06905326               0.10
## 20 Southern Europe    30     22       22 0.06755041               0.10
## 21 Southern Europe    31     23       23 0.08731382               0.10
## 22 Southern Europe    32     24       24 0.09104076               0.10
## 23 Southern Europe    33     25       25 0.11233310               0.10
## 24 Southern Europe    34     26       26 0.09919701               0.10
## 25  Eastern Europe    35     27       27 0.20349780               0.11
## 26 Southern Europe    36     28       28 0.08888572               0.10
## 27 Southern Europe    37     29       29 0.09286274               0.10
## 28  Eastern Europe    38     30       30 0.10748444               0.11
## 29  Eastern Europe    39     31       31 0.10234234               0.11
## 30  Eastern Europe    40     32       32 0.09220266               0.11
## 31  Eastern Europe    41     33       33 0.07821586               0.11
## 32  Eastern Europe    42     34       34 0.12956210               0.11
## 33  Eastern Europe    43     35       35 0.09680747               0.11
## 34  Eastern Europe    44     36       36 0.09497907               0.11
## 35  Eastern Europe    45     37       37 0.08506636               0.11
## 36  Eastern Europe    46     38       38 0.10803478               0.11
## 37  Eastern Europe    47     39       39 0.12845411               0.11
## 38  Eastern Europe    48     40       40 0.11637869               0.11
## 39  Eastern Europe    49     41       41 0.11376564               0.11
## 40  Eastern Europe    50     42       42 0.08040068               0.11

Mean By country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
##                Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##             Region order order2 orderold   mean_fst mean_fst_by_region
## 1   Western Europe     9      1        1 0.07544418               0.08
## 2   Western Europe    10      2        2 0.07874215               0.08
## 3  Southern Europe    11      3        3 0.07110287               0.10
## 4  Southern Europe    13      5        5 0.09032277               0.10
## 5  Southern Europe    14      6        6 0.12206649               0.10
## 6  Southern Europe    15      7        7 0.10043809               0.10
## 7  Southern Europe    16      8        8 0.11431214               0.10
## 8  Southern Europe    17      9        9 0.09158751               0.10
## 9  Southern Europe    18     10       10 0.09227643               0.10
## 10 Southern Europe    20     12       12 0.14626402               0.10
## 11 Southern Europe    21     13       13 0.13081478               0.10
## 12 Southern Europe    22     14       14 0.06377881               0.10
## 13 Southern Europe    23     15       15 0.10269616               0.10
## 14 Southern Europe    24     16       16 0.17817200               0.10
## 15 Southern Europe    25     17       17 0.08057195               0.10
## 16 Southern Europe    26     18       18 0.07113428               0.10
## 17 Southern Europe    27     19       19 0.08929935               0.10
## 18 Southern Europe    28     20       20 0.07221817               0.10
## 19 Southern Europe    29     21       21 0.06905326               0.10
## 20 Southern Europe    30     22       22 0.06755041               0.10
## 21 Southern Europe    31     23       23 0.08731382               0.10
## 22 Southern Europe    32     24       24 0.09104076               0.10
## 23 Southern Europe    33     25       25 0.11233310               0.10
## 24 Southern Europe    34     26       26 0.09919701               0.10
## 25  Eastern Europe    35     27       27 0.20349780               0.11
## 26 Southern Europe    36     28       28 0.08888572               0.10
## 27 Southern Europe    37     29       29 0.09286274               0.10
## 28  Eastern Europe    38     30       30 0.10748444               0.11
## 29  Eastern Europe    39     31       31 0.10234234               0.11
## 30  Eastern Europe    40     32       32 0.09220266               0.11
## 31  Eastern Europe    41     33       33 0.07821586               0.11
## 32  Eastern Europe    42     34       34 0.12956210               0.11
## 33  Eastern Europe    43     35       35 0.09680747               0.11
## 34  Eastern Europe    44     36       36 0.09497907               0.11
## 35  Eastern Europe    45     37       37 0.08506636               0.11
## 36  Eastern Europe    46     38       38 0.10803478               0.11
## 37  Eastern Europe    47     39       39 0.12845411               0.11
## 38  Eastern Europe    48     40       40 0.11637869               0.11
## 39  Eastern Europe    49     41       41 0.11376564               0.11
## 40  Eastern Europe    50     42       42 0.08040068               0.11
##    mean_fst_by_country
## 1                 0.08
## 2                 0.08
## 3                 0.07
## 4                 0.10
## 5                 0.10
## 6                 0.10
## 7                 0.10
## 8                 0.10
## 9                 0.10
## 10                0.10
## 11                0.10
## 12                0.10
## 13                0.10
## 14                0.10
## 15                0.10
## 16                0.10
## 17                0.10
## 18                0.10
## 19                0.07
## 20                0.07
## 21                0.09
## 22                0.10
## 23                0.10
## 24                0.10
## 25                0.20
## 26                0.09
## 27                0.09
## 28                0.11
## 29                0.10
## 30                0.09
## 31                0.09
## 32                0.11
## 33                0.11
## 34                0.11
## 35                0.11
## 36                0.11
## 37                0.11
## 38                0.11
## 39                0.11
## 40                0.08

Mean by latitude

# Add a new column to indicate whether the latitude is above or below 30N
fst3 <- fst3 |>
  mutate(Latitude_group = ifelse(Latitude >= 40, "Above 40N", "Below 40N"))

# Summarize the data by Latitude_group and calculate the mean_fst
summary_by_latitude <- fst3 |>
  group_by(Latitude_group) |>
  summarize(mean_fst_by_latitude = mean(mean_fst, na.rm = TRUE)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_latitude column to the fst3 tibble
fst3 <- fst3 |>
  left_join(summary_by_latitude, by = "Latitude_group")


# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City)

# Print the modified fst3 tibble
print(fst3)
##                    City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##             Region order order2 orderold   mean_fst mean_fst_by_region
## 1   Western Europe     9      1        1 0.07544418               0.08
## 2   Western Europe    10      2        2 0.07874215               0.08
## 3  Southern Europe    11      3        3 0.07110287               0.10
## 4  Southern Europe    13      5        5 0.09032277               0.10
## 5  Southern Europe    14      6        6 0.12206649               0.10
## 6  Southern Europe    15      7        7 0.10043809               0.10
## 7  Southern Europe    16      8        8 0.11431214               0.10
## 8  Southern Europe    17      9        9 0.09158751               0.10
## 9  Southern Europe    18     10       10 0.09227643               0.10
## 10 Southern Europe    20     12       12 0.14626402               0.10
## 11 Southern Europe    21     13       13 0.13081478               0.10
## 12 Southern Europe    22     14       14 0.06377881               0.10
## 13 Southern Europe    23     15       15 0.10269616               0.10
## 14 Southern Europe    24     16       16 0.17817200               0.10
## 15 Southern Europe    25     17       17 0.08057195               0.10
## 16 Southern Europe    26     18       18 0.07113428               0.10
## 17 Southern Europe    27     19       19 0.08929935               0.10
## 18 Southern Europe    28     20       20 0.07221817               0.10
## 19 Southern Europe    29     21       21 0.06905326               0.10
## 20 Southern Europe    30     22       22 0.06755041               0.10
## 21 Southern Europe    31     23       23 0.08731382               0.10
## 22 Southern Europe    32     24       24 0.09104076               0.10
## 23 Southern Europe    33     25       25 0.11233310               0.10
## 24 Southern Europe    34     26       26 0.09919701               0.10
## 25  Eastern Europe    35     27       27 0.20349780               0.11
## 26 Southern Europe    36     28       28 0.08888572               0.10
## 27 Southern Europe    37     29       29 0.09286274               0.10
## 28  Eastern Europe    38     30       30 0.10748444               0.11
## 29  Eastern Europe    39     31       31 0.10234234               0.11
## 30  Eastern Europe    40     32       32 0.09220266               0.11
## 31  Eastern Europe    41     33       33 0.07821586               0.11
## 32  Eastern Europe    42     34       34 0.12956210               0.11
## 33  Eastern Europe    43     35       35 0.09680747               0.11
## 34  Eastern Europe    44     36       36 0.09497907               0.11
## 35  Eastern Europe    45     37       37 0.08506636               0.11
## 36  Eastern Europe    46     38       38 0.10803478               0.11
## 37  Eastern Europe    47     39       39 0.12845411               0.11
## 38  Eastern Europe    48     40       40 0.11637869               0.11
## 39  Eastern Europe    49     41       41 0.11376564               0.11
## 40  Eastern Europe    50     42       42 0.08040068               0.11
##    mean_fst_by_country Latitude_group mean_fst_by_latitude
## 1                 0.08      Above 40N           0.10257911
## 2                 0.08      Above 40N           0.10257911
## 3                 0.07      Above 40N           0.10257911
## 4                 0.10      Below 40N           0.09296873
## 5                 0.10      Below 40N           0.09296873
## 6                 0.10      Below 40N           0.09296873
## 7                 0.10      Above 40N           0.10257911
## 8                 0.10      Below 40N           0.09296873
## 9                 0.10      Above 40N           0.10257911
## 10                0.10      Above 40N           0.10257911
## 11                0.10      Above 40N           0.10257911
## 12                0.10      Above 40N           0.10257911
## 13                0.10      Above 40N           0.10257911
## 14                0.10      Above 40N           0.10257911
## 15                0.10      Above 40N           0.10257911
## 16                0.10      Above 40N           0.10257911
## 17                0.10      Below 40N           0.09296873
## 18                0.10      Above 40N           0.10257911
## 19                0.07      Below 40N           0.09296873
## 20                0.07      Above 40N           0.10257911
## 21                0.09      Above 40N           0.10257911
## 22                0.10      Above 40N           0.10257911
## 23                0.10      Above 40N           0.10257911
## 24                0.10      Above 40N           0.10257911
## 25                0.20      Above 40N           0.10257911
## 26                0.09      Below 40N           0.09296873
## 27                0.09      Below 40N           0.09296873
## 28                0.11      Above 40N           0.10257911
## 29                0.10      Above 40N           0.10257911
## 30                0.09      Below 40N           0.09296873
## 31                0.09      Above 40N           0.10257911
## 32                0.11      Above 40N           0.10257911
## 33                0.11      Above 40N           0.10257911
## 34                0.11      Above 40N           0.10257911
## 35                0.11      Above 40N           0.10257911
## 36                0.11      Above 40N           0.10257911
## 37                0.11      Above 40N           0.10257911
## 38                0.11      Above 40N           0.10257911
## 39                0.11      Above 40N           0.10257911
## 40                0.08      Above 40N           0.10257911
fst4 <- fst3 |>
  dplyr::select(
    Latitude_group, mean_fst_by_latitude, Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Latitude_group, Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
##   Latitude_group mean_fst_by_latitude         Region mean_fst_by_region
## 1      Above 40N                  0.1 Eastern Europe               0.11
## 2      Above 40N                  0.1 Eastern Europe               0.11
## 3      Above 40N                  0.1 Eastern Europe               0.11
## 4      Above 40N                  0.1 Eastern Europe               0.11
## 5      Above 40N                  0.1 Eastern Europe               0.11
## 6      Above 40N                  0.1 Eastern Europe               0.11
##    Country mean_fst_by_country              City Abbreviation mean_fst
## 1  Armenia                0.08            Ijevan          ARM     0.08
## 2 Bulgaria                0.10               Lom          BUL     0.10
## 3  Georgia                0.11 Sakhumi, Abkhazia          GES     0.11
## 4  Romania                0.11         Satu Mare          ROS     0.11
## 5   Russia                0.11           Armavir          RAR     0.12
## 6   Russia                0.11         Krasnodar          KRA     0.09
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values for Europe using SNP Set 3.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values for Europe using SNP Set 3.

Latitude_group

mean_fst_by_latitude

Region

mean_fst_by_region

Country

mean_fst_by_country

City

Abbreviation

mean_fst

Above 40N

0.10

Eastern Europe

0.11

Armenia

0.08

Ijevan

ARM

0.08

Above 40N

0.10

Eastern Europe

0.11

Bulgaria

0.10

Lom

BUL

0.10

Above 40N

0.10

Eastern Europe

0.11

Georgia

0.11

Sakhumi, Abkhazia

GES

0.11

Above 40N

0.10

Eastern Europe

0.11

Romania

0.11

Satu Mare

ROS

0.11

Above 40N

0.10

Eastern Europe

0.11

Russia

0.11

Armavir

RAR

0.12

Above 40N

0.10

Eastern Europe

0.11

Russia

0.11

Krasnodar

KRA

0.09

Above 40N

0.10

Eastern Europe

0.11

Russia

0.11

Sochi

SOC

0.11

Above 40N

0.10

Eastern Europe

0.11

Russia

0.11

Tikhoretsk

TIK

0.13

Above 40N

0.10

Eastern Europe

0.11

Serbia

0.20

Novi Sad

SER

0.20

Above 40N

0.10

Eastern Europe

0.11

Turkey

0.09

Hopa

TUH

0.08

Above 40N

0.10

Eastern Europe

0.11

Ukraine

0.11

Alushta

ALU

0.10

Above 40N

0.10

Eastern Europe

0.11

Ukraine

0.11

Kerch, Crimea

KER

0.09

Above 40N

0.10

Eastern Europe

0.11

Ukraine

0.11

Sevastopol, Crimea

SEV

0.13

Above 40N

0.10

Southern Europe

0.10

Albania

0.10

Durres

ALD

0.11

Above 40N

0.10

Southern Europe

0.10

Albania

0.10

Tirana

TIR

0.10

Above 40N

0.10

Southern Europe

0.10

Albania

0.10

Vlore

ALV

0.09

Above 40N

0.10

Southern Europe

0.10

Croatia

0.09

Dubrovnik

CRO

0.09

Above 40N

0.10

Southern Europe

0.10

Italy

0.10

Bologna

ITB

0.10

Above 40N

0.10

Southern Europe

0.10

Italy

0.10

Brescia

BRE

0.15

Above 40N

0.10

Southern Europe

0.10

Italy

0.10

Cesena

CES

0.18

Above 40N

0.10

Southern Europe

0.10

Italy

0.10

Desenzano

DES

0.13

Above 40N

0.10

Southern Europe

0.10

Italy

0.10

Imperia

IMP

0.09

Above 40N

0.10

Southern Europe

0.10

Italy

0.10

Puglia

ITP

0.07

Above 40N

0.10

Southern Europe

0.10

Italy

0.10

Rome (Sapienza)

ROM

0.08

Above 40N

0.10

Southern Europe

0.10

Italy

0.10

Rome (Trappola)

ITR

0.07

Above 40N

0.10

Southern Europe

0.10

Italy

0.10

Trentino

TRE

0.06

Above 40N

0.10

Southern Europe

0.10

Portugal

0.07

Penafiel

POP

0.07

Above 40N

0.10

Southern Europe

0.10

Slovenia

0.07

Ajdovscina

SLO

0.07

Above 40N

0.10

Southern Europe

0.10

Spain

0.10

Barcelona

BAR

0.11

Above 40N

0.10

Western Europe

0.08

France

0.08

Saint-Martin-d'Heres

FRS

0.08

Above 40N

0.10

Western Europe

0.08

France

0.08

Strasbourg

STS

0.08

Below 40N

0.09

Eastern Europe

0.11

Turkey

0.09

Aliaga

TUA

0.09

Below 40N

0.09

Southern Europe

0.10

Greece

0.09

Athens

GRA

0.09

Below 40N

0.09

Southern Europe

0.10

Greece

0.09

Chania

GRC

0.09

Below 40N

0.09

Southern Europe

0.10

Italy

0.10

Sicilia

SIC

0.09

Below 40N

0.09

Southern Europe

0.10

Malta

0.07

Luqa

MAL

0.07

Below 40N

0.09

Southern Europe

0.10

Spain

0.10

Badajoz

SPB

0.09

Below 40N

0.09

Southern Europe

0.10

Spain

0.10

Catarroja

SPC

0.10

Below 40N

0.09

Southern Europe

0.10

Spain

0.10

Magaluf

SPM

0.09

Below 40N

0.09

Southern Europe

0.10

Spain

0.10

San Roque

SPS

0.12

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "europe",
  "fst", 
  "fst_Europe_MAF1.docx"
  )

# Save the Word document
print(doc, target = output_path)

Without latitude

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na() |>
  dplyr::select(
    -Subregion
  )

# check output
head(fst3)
##               Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1 Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2           Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3             Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4              Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5            San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6            Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
##            Region order order2 orderold   mean_fst
## 1  Western Europe     9      1        1 0.07544418
## 2  Western Europe    10      2        2 0.07874215
## 3 Southern Europe    11      3        3 0.07110287
## 4 Southern Europe    13      5        5 0.09032277
## 5 Southern Europe    14      6        6 0.12206649
## 6 Southern Europe    15      7        7 0.10043809

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
##                Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##             Region order order2 orderold   mean_fst mean_fst_by_region
## 1   Western Europe     9      1        1 0.07544418               0.08
## 2   Western Europe    10      2        2 0.07874215               0.08
## 3  Southern Europe    11      3        3 0.07110287               0.10
## 4  Southern Europe    13      5        5 0.09032277               0.10
## 5  Southern Europe    14      6        6 0.12206649               0.10
## 6  Southern Europe    15      7        7 0.10043809               0.10
## 7  Southern Europe    16      8        8 0.11431214               0.10
## 8  Southern Europe    17      9        9 0.09158751               0.10
## 9  Southern Europe    18     10       10 0.09227643               0.10
## 10 Southern Europe    20     12       12 0.14626402               0.10
## 11 Southern Europe    21     13       13 0.13081478               0.10
## 12 Southern Europe    22     14       14 0.06377881               0.10
## 13 Southern Europe    23     15       15 0.10269616               0.10
## 14 Southern Europe    24     16       16 0.17817200               0.10
## 15 Southern Europe    25     17       17 0.08057195               0.10
## 16 Southern Europe    26     18       18 0.07113428               0.10
## 17 Southern Europe    27     19       19 0.08929935               0.10
## 18 Southern Europe    28     20       20 0.07221817               0.10
## 19 Southern Europe    29     21       21 0.06905326               0.10
## 20 Southern Europe    30     22       22 0.06755041               0.10
## 21 Southern Europe    31     23       23 0.08731382               0.10
## 22 Southern Europe    32     24       24 0.09104076               0.10
## 23 Southern Europe    33     25       25 0.11233310               0.10
## 24 Southern Europe    34     26       26 0.09919701               0.10
## 25  Eastern Europe    35     27       27 0.20349780               0.11
## 26 Southern Europe    36     28       28 0.08888572               0.10
## 27 Southern Europe    37     29       29 0.09286274               0.10
## 28  Eastern Europe    38     30       30 0.10748444               0.11
## 29  Eastern Europe    39     31       31 0.10234234               0.11
## 30  Eastern Europe    40     32       32 0.09220266               0.11
## 31  Eastern Europe    41     33       33 0.07821586               0.11
## 32  Eastern Europe    42     34       34 0.12956210               0.11
## 33  Eastern Europe    43     35       35 0.09680747               0.11
## 34  Eastern Europe    44     36       36 0.09497907               0.11
## 35  Eastern Europe    45     37       37 0.08506636               0.11
## 36  Eastern Europe    46     38       38 0.10803478               0.11
## 37  Eastern Europe    47     39       39 0.12845411               0.11
## 38  Eastern Europe    48     40       40 0.11637869               0.11
## 39  Eastern Europe    49     41       41 0.11376564               0.11
## 40  Eastern Europe    50     42       42 0.08040068               0.11

Mean By country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
##                Pop_City  Country Latitude Longitude Continent Abbreviation Year
## 1  Saint-Martin-d'Heres   France 45.16531  5.771806    Europe          FRS 2019
## 2            Strasbourg   France 48.61124  7.754512    Europe          STS 2019
## 3              Penafiel Portugal 41.18555 -8.329371    Europe          POP 2017
## 4               Badajoz    Spain 38.86622 -6.974194    Europe          SPB 2018
## 5             San Roque    Spain 36.17042 -5.371530    Europe          SPS 2017
## 6             Catarroja    Spain 39.40294 -0.395514    Europe          SPC 2017
## 7             Barcelona    Spain 41.38510  2.173400    Europe          BAR 2018
## 8               Magaluf    Spain 39.50679  2.530729    Europe          SPM 2017
## 9               Imperia    Italy 43.87159  8.003559    Europe          IMP 2017
## 10              Brescia    Italy 45.53373 10.204450    Europe          BRE 1995
## 11            Desenzano    Italy 45.46289 10.549140    Europe          DES 1995
## 12             Trentino    Italy 46.05917 11.117220    Europe          TRE 2020
## 13              Bologna    Italy 44.48478 11.366584    Europe          ITB 2017
## 14               Cesena    Italy 44.15287 12.244265    Europe          CES 1995
## 15      Rome (Sapienza)    Italy 41.90215 12.517399    Europe          ROM 2017
## 16      Rome (Trappola)    Italy 41.90215 12.517399    Europe          ITR 2013
## 17              Sicilia    Italy 38.23294 15.550877    Europe          SIC 2016
## 18               Puglia    Italy 41.12213 16.844107    Europe          ITP 2016
## 19                 Luqa    Malta 35.86053 14.487028    Europe          MAL 2019
## 20           Ajdovscina Slovenia 45.88715 13.770997    Europe          SLO 2017
## 21            Dubrovnik  Croatia 42.60654 18.226612    Europe          CRO 2017
## 22                Vlore  Albania 40.46600 19.489700    Europe          ALV 2020
## 23               Durres  Albania 41.29704 19.503734    Europe          ALD 2018
## 24               Tirana  Albania 41.31473 19.831716    Europe          TIR 2017
## 25             Novi Sad   Serbia 45.25887 19.818778    Europe          SER 2019
## 26               Athens   Greece 37.93719 23.946883    Europe          GRA 2019
## 27               Chania   Greece 35.51448 24.017960    Europe          GRC 2019
## 28            Satu Mare  Romania 47.79147 22.890202    Europe          ROS 2020
## 29                  Lom Bulgaria 43.80489 23.236340    Europe          BUL 2019
## 30               Aliaga   Turkey 38.76390 26.944800    Europe          TUA 2019
## 31                 Hopa   Turkey 41.38760 41.437800    Europe          TUH 2019
## 32   Sevastopol, Crimea  Ukraine 44.54125 33.514005    Europe          SEV 2021
## 33              Alushta  Ukraine 44.68289 34.403681    Europe          ALU 2021
## 34        Kerch, Crimea  Ukraine 45.35246 36.470150    Europe          KER 2021
## 35            Krasnodar   Russia 44.95504 39.027817    Europe          KRA 2017
## 36                Sochi   Russia 43.60042 39.745328    Europe          SOC 2021
## 37           Tikhoretsk   Russia 45.85460 40.125600    Europe          TIK 2021
## 38              Armavir   Russia 44.96034 41.133057    Europe          RAR 2021
## 39    Sakhumi, Abkhazia  Georgia 43.07851 40.887588    Europe          GES 2021
## 40               Ijevan  Armenia 40.87971 45.147640    Europe          ARM 2020
##             Region order order2 orderold   mean_fst mean_fst_by_region
## 1   Western Europe     9      1        1 0.07544418               0.08
## 2   Western Europe    10      2        2 0.07874215               0.08
## 3  Southern Europe    11      3        3 0.07110287               0.10
## 4  Southern Europe    13      5        5 0.09032277               0.10
## 5  Southern Europe    14      6        6 0.12206649               0.10
## 6  Southern Europe    15      7        7 0.10043809               0.10
## 7  Southern Europe    16      8        8 0.11431214               0.10
## 8  Southern Europe    17      9        9 0.09158751               0.10
## 9  Southern Europe    18     10       10 0.09227643               0.10
## 10 Southern Europe    20     12       12 0.14626402               0.10
## 11 Southern Europe    21     13       13 0.13081478               0.10
## 12 Southern Europe    22     14       14 0.06377881               0.10
## 13 Southern Europe    23     15       15 0.10269616               0.10
## 14 Southern Europe    24     16       16 0.17817200               0.10
## 15 Southern Europe    25     17       17 0.08057195               0.10
## 16 Southern Europe    26     18       18 0.07113428               0.10
## 17 Southern Europe    27     19       19 0.08929935               0.10
## 18 Southern Europe    28     20       20 0.07221817               0.10
## 19 Southern Europe    29     21       21 0.06905326               0.10
## 20 Southern Europe    30     22       22 0.06755041               0.10
## 21 Southern Europe    31     23       23 0.08731382               0.10
## 22 Southern Europe    32     24       24 0.09104076               0.10
## 23 Southern Europe    33     25       25 0.11233310               0.10
## 24 Southern Europe    34     26       26 0.09919701               0.10
## 25  Eastern Europe    35     27       27 0.20349780               0.11
## 26 Southern Europe    36     28       28 0.08888572               0.10
## 27 Southern Europe    37     29       29 0.09286274               0.10
## 28  Eastern Europe    38     30       30 0.10748444               0.11
## 29  Eastern Europe    39     31       31 0.10234234               0.11
## 30  Eastern Europe    40     32       32 0.09220266               0.11
## 31  Eastern Europe    41     33       33 0.07821586               0.11
## 32  Eastern Europe    42     34       34 0.12956210               0.11
## 33  Eastern Europe    43     35       35 0.09680747               0.11
## 34  Eastern Europe    44     36       36 0.09497907               0.11
## 35  Eastern Europe    45     37       37 0.08506636               0.11
## 36  Eastern Europe    46     38       38 0.10803478               0.11
## 37  Eastern Europe    47     39       39 0.12845411               0.11
## 38  Eastern Europe    48     40       40 0.11637869               0.11
## 39  Eastern Europe    49     41       41 0.11376564               0.11
## 40  Eastern Europe    50     42       42 0.08040068               0.11
##    mean_fst_by_country
## 1                 0.08
## 2                 0.08
## 3                 0.07
## 4                 0.10
## 5                 0.10
## 6                 0.10
## 7                 0.10
## 8                 0.10
## 9                 0.10
## 10                0.10
## 11                0.10
## 12                0.10
## 13                0.10
## 14                0.10
## 15                0.10
## 16                0.10
## 17                0.10
## 18                0.10
## 19                0.07
## 20                0.07
## 21                0.09
## 22                0.10
## 23                0.10
## 24                0.10
## 25                0.20
## 26                0.09
## 27                0.09
## 28                0.11
## 29                0.10
## 30                0.09
## 31                0.09
## 32                0.11
## 33                0.11
## 34                0.11
## 35                0.11
## 36                0.11
## 37                0.11
## 38                0.11
## 39                0.11
## 40                0.08
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City)

fst4 <- fst3 |>
  dplyr::select(
    Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
##           Region mean_fst_by_region  Country mean_fst_by_country
## 1 Eastern Europe               0.11  Armenia                0.08
## 2 Eastern Europe               0.11 Bulgaria                0.10
## 3 Eastern Europe               0.11  Georgia                0.11
## 4 Eastern Europe               0.11  Romania                0.11
## 5 Eastern Europe               0.11   Russia                0.11
## 6 Eastern Europe               0.11   Russia                0.11
##                City Abbreviation mean_fst
## 1            Ijevan          ARM     0.08
## 2               Lom          BUL     0.10
## 3 Sakhumi, Abkhazia          GES     0.11
## 4         Satu Mare          ROS     0.11
## 5           Armavir          RAR     0.12
## 6         Krasnodar          KRA     0.09
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values for Europe using SNP Set 3.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values for Europe using SNP Set 3.

Region

mean_fst_by_region

Country

mean_fst_by_country

City

Abbreviation

mean_fst

Eastern Europe

0.11

Armenia

0.08

Ijevan

ARM

0.08

Eastern Europe

0.11

Bulgaria

0.10

Lom

BUL

0.10

Eastern Europe

0.11

Georgia

0.11

Sakhumi, Abkhazia

GES

0.11

Eastern Europe

0.11

Romania

0.11

Satu Mare

ROS

0.11

Eastern Europe

0.11

Russia

0.11

Armavir

RAR

0.12

Eastern Europe

0.11

Russia

0.11

Krasnodar

KRA

0.09

Eastern Europe

0.11

Russia

0.11

Sochi

SOC

0.11

Eastern Europe

0.11

Russia

0.11

Tikhoretsk

TIK

0.13

Eastern Europe

0.11

Serbia

0.20

Novi Sad

SER

0.20

Eastern Europe

0.11

Turkey

0.09

Aliaga

TUA

0.09

Eastern Europe

0.11

Turkey

0.09

Hopa

TUH

0.08

Eastern Europe

0.11

Ukraine

0.11

Alushta

ALU

0.10

Eastern Europe

0.11

Ukraine

0.11

Kerch, Crimea

KER

0.09

Eastern Europe

0.11

Ukraine

0.11

Sevastopol, Crimea

SEV

0.13

Southern Europe

0.10

Albania

0.10

Durres

ALD

0.11

Southern Europe

0.10

Albania

0.10

Tirana

TIR

0.10

Southern Europe

0.10

Albania

0.10

Vlore

ALV

0.09

Southern Europe

0.10

Croatia

0.09

Dubrovnik

CRO

0.09

Southern Europe

0.10

Greece

0.09

Athens

GRA

0.09

Southern Europe

0.10

Greece

0.09

Chania

GRC

0.09

Southern Europe

0.10

Italy

0.10

Bologna

ITB

0.10

Southern Europe

0.10

Italy

0.10

Brescia

BRE

0.15

Southern Europe

0.10

Italy

0.10

Cesena

CES

0.18

Southern Europe

0.10

Italy

0.10

Desenzano

DES

0.13

Southern Europe

0.10

Italy

0.10

Imperia

IMP

0.09

Southern Europe

0.10

Italy

0.10

Puglia

ITP

0.07

Southern Europe

0.10

Italy

0.10

Rome (Sapienza)

ROM

0.08

Southern Europe

0.10

Italy

0.10

Rome (Trappola)

ITR

0.07

Southern Europe

0.10

Italy

0.10

Sicilia

SIC

0.09

Southern Europe

0.10

Italy

0.10

Trentino

TRE

0.06

Southern Europe

0.10

Malta

0.07

Luqa

MAL

0.07

Southern Europe

0.10

Portugal

0.07

Penafiel

POP

0.07

Southern Europe

0.10

Slovenia

0.07

Ajdovscina

SLO

0.07

Southern Europe

0.10

Spain

0.10

Badajoz

SPB

0.09

Southern Europe

0.10

Spain

0.10

Barcelona

BAR

0.11

Southern Europe

0.10

Spain

0.10

Catarroja

SPC

0.10

Southern Europe

0.10

Spain

0.10

Magaluf

SPM

0.09

Southern Europe

0.10

Spain

0.10

San Roque

SPS

0.12

Western Europe

0.08

France

0.08

Saint-Martin-d'Heres

FRS

0.08

Western Europe

0.08

France

0.08

Strasbourg

STS

0.08

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "output",
  "europe",
  "fst", 
  "fst_Europe_MAF1_no_lat.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "output", "europe", "fst", "fst_country_MAF1.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           FRS       STS       POP       SPB       SPS       SPC      BAR
## FRS    0.0000  412.1522 1225.4383 1263.7518 1371.4448  817.6702 511.8716
## STS  412.1522    0.0000 1509.1745 1601.1573 1750.4874 1213.5639 915.9614
## POP 1225.4383 1509.1745    0.0000  282.8409  614.5101  701.9619 878.2910
## SPB 1263.7518 1601.1573  282.8409    0.0000  331.7685  571.0509 827.0716
## SPS 1371.4448 1750.4874  614.5101  331.7685    0.0000  566.5153 874.4050
## SPC  817.6702 1213.5639  701.9619  571.0509  566.5153    0.0000 310.0074
##           SPM       IMP       BRE       DES       TRE       ITB       CES
## FRS  683.8845  228.2822  349.1505  375.3876  427.8791  448.0871  524.5893
## STS 1095.6072  527.9620  389.6504  409.5459  380.8019  536.0256  604.0707
## POP  939.4811 1370.3898 1572.0008 1596.9006 1653.2807 1644.8695 1711.3013
## SPB  822.8201 1367.0861 1594.7230 1616.9360 1682.2286 1643.6694 1701.8380
## SPS  787.3372 1423.5497 1670.1969 1688.5822 1761.4648 1690.3169 1736.8728
## SPC  251.7726  857.0395 1104.4760 1122.4598 1195.9602 1124.6677 1173.2270
##           ROM       ITR      SIC      ITP      MAL       SLO      CRO      ALV
## FRS  654.1062  654.1062 1119.260 1004.474 1269.904  628.7258 1037.845 1234.162
## STS  834.5057  834.5057 1314.450 1098.126 1522.503  546.2343 1052.803 1296.908
## POP 1734.4434 1734.4434 2064.584 2102.566 2066.730 1851.8481 2197.100 2334.720
## SPB 1683.0686 1683.0686 1957.316 2040.301 1923.429 1869.2123 2157.626 2266.196
## SPS 1669.1413 1669.1413 1865.247 2002.327 1785.308 1929.6259 2144.409 2215.487
## SPC 1124.2692 1124.2692 1387.383 1474.434 1368.139 1363.2153 1600.981 1697.872
##          ALD      TIR       SER      GRA      GRC      ROS      BUL      TUA
## FRS 1192.210 1216.107 1100.2591 1709.174 1877.379 1341.427 1392.500 1884.818
## STS 1230.239 1248.235  988.6774 1763.684 1974.712 1124.852 1304.105 1885.544
## POP 2319.742 2346.521 2315.5228 2777.366 2876.039 2566.256 2591.385 3000.557
## SPB 2262.516 2289.859 2313.3538 2686.566 2760.206 2594.045 2569.099 2924.845
## SPS 2225.649 2253.121 2340.8167 2601.355 2642.916 2651.131 2568.416 2854.982
## SPC 1697.515 1724.957 1780.6744 2115.539 2193.176 2085.530 2019.617 2354.477
##          TUH      SEV      ALU      KER      KRA      SOC      TIK      RAR
## FRS 2897.030 2179.621 2245.512 2390.859 2596.424 2688.509 2660.271 2758.005
## STS 2745.357 2012.146 2071.666 2197.341 2400.939 2511.584 2447.357 2555.484
## POP 4103.736 3397.015 3464.308 3613.507 3818.891 3906.072 3885.080 3981.365
## SPB 4076.402 3387.944 3457.259 3614.231 3818.203 3894.554 3893.085 3982.756
## SPS 4053.790 3392.537 3463.561 3628.220 3828.858 3891.912 3913.323 3994.595
## SPC 3518.971 2842.701 2913.061 3074.730 3276.925 3346.038 3357.781 3442.342
##          GES      ARM
## FRS 2793.592 3207.499
## STS 2619.862 3045.177
## POP 4009.674 4417.211
## SPB 3995.220 4391.989
## SPS 3988.376 4369.793
## SPC 3444.704 3834.995

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0


# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))

data <- data |>
  dplyr::arrange(
    Distance
  )
head(data)
##   Distance FST row_index col_index
## 1        0   0       FRS       FRS
## 2        0   0       STS       STS
## 3        0   0       POP       POP
## 4        0   0       SPB       SPB
## 5        0   0       SPS       SPS
## 6        0   0       SPC       SPC

Fit linear regression

data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("analyses", "my_theme2.R"))


# Plot
ggplot(data, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
 annotate("text", x = max(log((data$Distance))) * 0.85, y = max(data$FST/(1-data$FST)) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),  # Increase font size for x-axis
        axis.text.y = element_text(size = 14                               ))  # Increase font size for y-axi
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "europe", "fst", "fst_by_distance_europe_MAF1_with_equ.pdf"),
  width = 6,
  height = 4,
  units = "in"
)

1.4 Subset by country

Select countries with at least 3 sampling localities in Europe

countries_with_3_pops <- filtered_sampling_loc %>%
  group_by(Country) %>%
  filter(n() >= 3) %>%
  pull(Country) %>%
  unique()
countries_with_3_pops
## [1] "Spain"   "Italy"   "Albania" "Ukraine" "Russia"

Do test for each country

results <- list()

for (country in countries_with_3_pops) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc %>%
    filter(Country == country) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  subset_data <- subset_data[subset_data$Distance > 0, ]
  # Perform linear regression
  lm_model <- lm(FST/(1-FST) ~ log(Distance), data = subset_data)
  results[[country]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $Spain
## $Spain$equation
## [1] "y = -0.01500x + 0.191"
## 
## $Spain$r2
## [1] "R^2 = 0.02"
## 
## 
## $Italy
## $Italy$equation
## [1] "y = -0.01389x + 0.190"
## 
## $Italy$r2
## [1] "R^2 = 0.03"
## 
## 
## $Albania
## $Albania$equation
## [1] "y = -0.01524x + 0.080"
## 
## $Albania$r2
## [1] "R^2 = 0.49"
## 
## 
## $Ukraine
## $Ukraine$equation
## [1] "y = -0.02810x + 0.223"
## 
## $Ukraine$r2
## [1] "R^2 = 0.90"
## 
## 
## $Russia
## $Russia$equation
## [1] "y = 0.01290x + -0.034"
## 
## $Russia$r2
## [1] "R^2 = 0.02"

Merge the data

data_merged <- data %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
  rename(Country1 = Country) %>%
  left_join(filtered_sampling_loc[, c("Pop_City", "Country", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
  dplyr::select(-Pop_City.x, -Pop_City.y) %>%
  filter(Country1 == Country)  # Ensures the data is within the same country


# Filter to get the coutries with 3 or more sampling localities
countries_to_include <- c("Spain", "Italy", "Albania", "Ukraine", "Russia")

# Filter
data_filtered <- data_merged %>%
  group_by(Country1) %>%
  filter(n() >= 3 & Country1 %in% countries_to_include) %>%
  ungroup()

Calculate linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST/(1-FST) ~ log(Distance), data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("italic(y) == %.3f * italic(x) + %.3f", coef(model)[2], coef(model)[1]),
         r2 = sprintf("italic(R)^2 == %.2f", summary(model)$r.squared))

Plot it

ggplot(data_filtered, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(
    data = regression_results, 
    aes(label = paste(equation, r2, sep = "~~")), 
    x = Inf, y = Inf, 
    vjust = 2, hjust = 1.2, 
    size = 3.5, 
    parse = TRUE, 
    inherit.aes = FALSE
  ) +
  labs(title = "Fst vs Distance by Country",
       x = "Log(Distance)",
       y = "Fst(1-Fst)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "europe", "fst", "fst_by_distance_countries_Europe_MAF1.pdf"),
  width = 6,
  height = 8,
  units = "in"
)

We can merge the FST and distance matrices

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the matrix (Fst two decimals and distance in Km with zero decimals)
# Format the elements based on their position in the matrix
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Now the merged_matrix should be formatted as you need
print(merged_matrix)
##     ALD    ALU    ALV    ARM    BAR    BRE    BUL    CES    CRO    DES   
## ALD NA     "0.12" "0.02" "0.11" "0.12" "0.17" "0.11" "0.20" "0.05" "0.16"
## ALU "1268" NA     "0.10" "0.06" "0.11" "0.16" "0.10" "0.19" "0.09" "0.15"
## ALV "93"   "1307" NA     "0.09" "0.10" "0.16" "0.09" "0.18" "0.03" "0.14"
## ARM "2144" "974"  "2159" NA     "0.11" "0.13" "0.10" "0.16" "0.08" "0.11"
## BAR "1446" "2631" "1458" "3566" NA     "0.18" "0.11" "0.21" "0.09" "0.17"
## BRE "887"  "1896" "942"  "2859" "796"  NA     "0.15" "0.16" "0.15" "0.09"
## BUL "414"  "895"  "483"  "1826" "1742" "1048" NA     "0.19" "0.09" "0.14"
## CES "673"  "1757" "724"  "2706" "878"  "223"  "881"  NA     "0.18" "0.14"
## CRO "180"  "1321" "260"  "2235" "1333" "719"  "428"  "514"  NA     "0.13"
## DES "859"  "1870" "915"  "2832" "815"  "28"   "1021" "198"  "691"  NA    
## FRS "1192" "2246" "1234" "3207" "512"  "349"  "1393" "525"  "1038" "375" 
## GES "1770" "550"  "1795" "429"  "3168" "2444" "1426" "2300" "1845" "2418"
## GRA "534"  "1151" "476"  "1848" "1900" "1417" "656"  "1200" "711"  "1390"
## GRC "754"  "1348" "679"  "1937" "2007" "1611" "925"  "1389" "934"  "1585"
## IMP "984"  "2097" "1020" "3047" "552"  "254"  "1221" "341"  "840"  "268" 
## ITB "752"  "1821" "802"  "2773" "824"  "148"  "950"  "79"   "591"  "126" 
## ITP "224"  "1482" "235"  "2367" "1227" "728"  "604"  "506"  "201"  "702" 
## ITR "585"  "1794" "605"  "2711" "862"  "445"  "899"  "252"  "477"  "427" 
## KER "1443" "179"  "1484" "862"  "2789" "2042" "1062" "1913" "1490" "2017"
## KRA "1633" "366"  "1670" "674"  "2992" "2248" "1261" "2117" "1687" "2222"
## MAL "746"  "1948" "674"  "2719" "1233" "1135" "1157" "943"  "817"  "1119"
## POP "2320" "3464" "2335" "4417" "878"  "1572" "2591" "1711" "2197" "1597"
## RAR "1798" "532"  "1833" "560"  "3157" "2410" "1427" "2281" "1853" "2385"
## ROM "585"  "1794" "605"  "2711" "862"  "445"  "899"  "252"  "477"  "427" 
## ROS "771"  "950"  "859"  "1923" "1783" "1000" "445"  "917"  "683"  "977" 
## SER "442"  "1149" "534"  "2107" "1489" "752"  "316"  "612"  "322"  "725" 
## SEV "1196" "72"   "1235" "1034" "2561" "1830" "824"  "1689" "1250" "1804"
## SIC "480"  "1724" "420"  "2545" "1195" "925"  "894"  "715"  "537"  "905" 
## SLO "689"  "1617" "761"  "2581" "1059" "280"  "782"  "227"  "509"  "255" 
## SOC "1678" "443"  "1706" "538"  "3068" "2339" "1327" "2197" "1748" "2313"
## SPB "2263" "3457" "2266" "4392" "827"  "1595" "2569" "1702" "2158" "1617"
## SPC "1698" "2913" "1698" "3835" "310"  "1104" "2020" "1173" "1601" "1122"
## SPM "1450" "2677" "1448" "3591" "211"  "919"  "1782" "956"  "1360" "933" 
## SPS "2226" "3464" "2215" "4370" "874"  "1670" "2568" "1737" "2144" "1689"
## STS "1230" "2072" "1297" "3045" "916"  "390"  "1304" "604"  "1053" "410" 
## TIK "1733" "467"  "1774" "687"  "3069" "2313" "1350" "2191" "1778" "2288"
## TIR "27"   "1242" "99"   "2117" "1473" "908"  "393"  "696"  "196"  "881" 
## TRE "858"  "1821" "920"  "2787" "887"  "92"   "986"  "230"  "684"  "80"  
## TUA "694"  "904"  "667"  "1571" "2123" "1569" "641"  "1363" "851"  "1542"
## TUH "1828" "679"  "1844" "316"  "3251" "2549" "1512" "2393" "1919" "2522"
##     FRS    GES    GRA    GRC    IMP    ITB    ITP    ITR    KER    KRA   
## ALD "0.09" "0.14" "0.08" "0.07" "0.11" "0.12" "0.08" "0.08" "0.12" "0.11"
## ALU "0.08" "0.09" "0.08" "0.09" "0.09" "0.10" "0.08" "0.07" "0.07" "0.05"
## ALV "0.07" "0.12" "0.06" "0.05" "0.08" "0.10" "0.06" "0.06" "0.10" "0.09"
## ARM "0.06" "0.06" "0.08" "0.08" "0.08" "0.09" "0.07" "0.07" "0.04" "0.02"
## BAR "0.09" "0.14" "0.09" "0.10" "0.11" "0.13" "0.08" "0.07" "0.12" "0.11"
## BRE "0.12" "0.18" "0.15" "0.15" "0.13" "0.12" "0.13" "0.14" "0.15" "0.15"
## BUL "0.07" "0.13" "0.09" "0.09" "0.09" "0.10" "0.06" "0.05" "0.11" "0.10"
## CES "0.15" "0.21" "0.18" "0.19" "0.18" "0.17" "0.16" "0.17" "0.18" "0.18"
## CRO "0.06" "0.12" "0.06" "0.05" "0.08" "0.09" "0.05" "0.05" "0.10" "0.09"
## DES "0.11" "0.17" "0.14" "0.14" "0.12" "0.11" "0.11" "0.13" "0.14" "0.14"
## FRS NA     "0.11" "0.07" "0.07" "0.05" "0.06" "0.04" "0.04" "0.08" "0.08"
## GES "2794" NA     "0.11" "0.12" "0.13" "0.14" "0.11" "0.10" "0.05" "0.02"
## GRA "1709" "1540" NA     "0.02" "0.08" "0.10" "0.05" "0.05" "0.10" "0.08"
## GRC "1877" "1675" "270"  NA     "0.09" "0.10" "0.06" "0.06" "0.10" "0.09"
## IMP "228"  "2640" "1492" "1653" NA     "0.08" "0.05" "0.05" "0.09" "0.09"
## ITB "448"  "2365" "1279" "1467" "277"  NA     "0.07" "0.07" "0.11" "0.11"
## ITP "1004" "1991" "705"  "884"  "787"  "583"  NA     "0.02" "0.08" "0.07"
## ITR "654"  "2321" "1070" "1224" "428"  "302"  "371"  NA     "0.08" "0.07"
## KER "2391" "434"  "1327" "1517" "2250" "1973" "1655" "1960" NA     "0.03"
## KRA "2596" "257"  "1478" "1648" "2455" "2178" "1848" "2160" "206"  NA    
## MAL "1270" "2395" "873"  "862"  "1049" "996"  "621"  "694"  "2127" "2301"
## POP "1225" "4010" "2777" "2876" "1370" "1645" "2103" "1734" "3614" "3819"
## RAR "2758" "210"  "1629" "1789" "2619" "2342" "2014" "2326" "369"  "166" 
## ROM "654"  "2321" "1070" "1224" "428"  "302"  "371"  "0"    "1960" "2160"
## ROS "1341" "1496" "1100" "1370" "1232" "961"  "884"  "1047" "1073" "1276"
## SER "1100" "1695" "884"  "1141" "949"  "672"  "520"  "697"  "1301" "1506"
## SEV "2180" "614"  "1086" "1288" "2029" "1753" "1410" "1723" "250"  "438" 
## SIC "1119" "2197" "736"  "812"  "891"  "779"  "340"  "483"  "1901" "2084"
## SLO "629"  "2166" "1220" "1440" "507"  "245"  "585"  "455"  "1762" "1968"
## SOC "2689" "109"  "1470" "1618" "2537" "2262" "1898" "2224" "325"  "161" 
## SPB "1264" "3995" "2687" "2760" "1367" "1644" "2040" "1683" "3614" "3818"
## SPC "818"  "3445" "2116" "2193" "857"  "1125" "1474" "1124" "3075" "3277"
## SPM "684"  "3204" "1864" "1944" "665"  "916"  "1227" "883"  "2841" "3042"
## SPS "1371" "3988" "2601" "2643" "1424" "1690" "2002" "1669" "3628" "3829"
## STS "412"  "2620" "1764" "1975" "528"  "536"  "1098" "835"  "2197" "2401"
## TIK "2660" "315"  "1600" "1776" "2527" "2250" "1945" "2247" "290"  "132" 
## TIR "1216" "1743" "515"  "742"  "1009" "775"  "251"  "612"  "1417" "1607"
## TRE "428"  "2370" "1391" "1596" "346"  "176"  "717"  "476"  "1964" "2170"
## TUA "1885" "1265" "277"  "445"  "1678" "1441" "901"  "1271" "1075" "1214"
## TUH "2897" "194"  "1544" "1650" "2734" "2461" "2051" "2395" "597"  "443" 
##     MAL    POP    RAR    ROM    ROS    SER    SEV    SIC    SLO    SOC   
## ALD "0.08" "0.09" "0.15" "0.10" "0.12" "0.22" "0.16" "0.11" "0.08" "0.14"
## ALU "0.07" "0.08" "0.08" "0.08" "0.11" "0.20" "0.09" "0.10" "0.07" "0.07"
## ALV "0.06" "0.07" "0.13" "0.07" "0.10" "0.18" "0.14" "0.09" "0.06" "0.12"
## ARM "0.06" "0.05" "0.05" "0.08" "0.09" "0.19" "0.07" "0.08" "0.05" "0.05"
## BAR "0.08" "0.09" "0.14" "0.08" "0.12" "0.22" "0.16" "0.10" "0.08" "0.14"
## BRE "0.12" "0.11" "0.19" "0.16" "0.14" "0.25" "0.20" "0.12" "0.10" "0.18"
## BUL "0.06" "0.07" "0.14" "0.07" "0.10" "0.20" "0.15" "0.08" "0.07" "0.13"
## CES "0.15" "0.13" "0.22" "0.19" "0.17" "0.28" "0.23" "0.16" "0.14" "0.21"
## CRO "0.06" "0.06" "0.12" "0.05" "0.10" "0.17" "0.14" "0.08" "0.06" "0.11"
## DES "0.11" "0.08" "0.18" "0.14" "0.11" "0.23" "0.18" "0.09" "0.08" "0.17"
## FRS "0.04" "0.04" "0.11" "0.04" "0.07" "0.16" "0.12" "0.06" "0.04" "0.10"
## GES "0.10" "0.10" "0.03" "0.11" "0.14" "0.23" "0.05" "0.12" "0.10" "0.01"
## GRA "0.05" "0.07" "0.12" "0.05" "0.10" "0.19" "0.13" "0.08" "0.05" "0.11"
## GRC "0.06" "0.07" "0.12" "0.06" "0.10" "0.19" "0.13" "0.08" "0.06" "0.12"
## IMP "0.04" "0.05" "0.13" "0.06" "0.09" "0.23" "0.15" "0.07" "0.05" "0.12"
## ITB "0.06" "0.05" "0.14" "0.08" "0.10" "0.22" "0.16" "0.08" "0.06" "0.13"
## ITP "0.02" "0.04" "0.11" "0.02" "0.08" "0.17" "0.12" "0.04" "0.04" "0.10"
## ITR "0.02" "0.03" "0.10" "0.01" "0.08" "0.16" "0.12" "0.05" "0.04" "0.10"
## KER "0.08" "0.07" "0.05" "0.09" "0.11" "0.20" "0.07" "0.10" "0.07" "0.05"
## KRA "0.07" "0.07" "0.01" "0.08" "0.11" "0.20" "0.03" "0.09" "0.06" "0.01"
## MAL NA     "0.03" "0.11" "0.02" "0.07" "0.16" "0.12" "0.05" "0.04" "0.09"
## POP "2067" NA     "0.11" "0.04" "0.07" "0.17" "0.12" "0.05" "0.04" "0.10"
## RAR "2460" "3981" NA     "0.12" "0.14" "0.24" "0.05" "0.13" "0.10" "0.02"
## ROM "694"  "1734" "2326" NA     "0.09" "0.21" "0.13" "0.06" "0.04" "0.11"
## ROS "1498" "2566" "1432" "1047" NA     "0.20" "0.15" "0.09" "0.06" "0.13"
## SER "1139" "2316" "1670" "697"  "367"  NA     "0.25" "0.19" "0.15" "0.23"
## SEV "1878" "3397" "604"  "1723" "894"  "1082" NA     "0.14" "0.11" "0.04"
## SIC "280"  "2065" "2246" "483"  "1219" "858"  "1652" NA     "0.05" "0.12"
## SLO "1118" "1852" "2130" "455"  "726"  "476"  "1551" "865"  NA     "0.09"
## SOC "2316" "3906" "187"  "2224" "1388" "1591" "509"  "2113" "2060" NA    
## SPB "1923" "283"  "3983" "1683" "2594" "2313" "3388" "1957" "1869" "3895"
## SPC "1368" "702"  "3442" "1124" "2086" "1781" "2843" "1387" "1363" "3346"
## SPM "1128" "939"  "3208" "883"  "1873" "1555" "2606" "1136" "1160" "3107"
## SPS "1785" "615"  "3995" "1669" "2651" "2341" "3393" "1865" "1930" "3892"
## STS "1523" "1509" "2555" "835"  "1125" "989"  "2012" "1314" "546"  "2512"
## TIK "2414" "3885" "127"  "2247" "1328" "1580" "539"  "2190" "2033" "253" 
## TIR "764"  "2347" "1772" "612"  "761"  "439"  "1170" "502"  "705"  "1651"
## TRE "1170" "1653" "2331" "476"  "915"  "683"  "1756" "945"  "206"  "2264"
## TUA "1148" "3001" "1361" "1271" "1057" "932"  "843"  "994"  "1340" "1198"
## TUH "2412" "4104" "398"  "2395" "1629" "1797" "734"  "2233" "2272" "283" 
##     SPB    SPC    SPM    SPS    STS    TIK    TIR     TRE    TUA    TUH   
## ALD "0.12" "0.13" "0.11" "0.14" "0.10" "0.16" "0.03"  "0.08" "0.10" "0.09"
## ALU "0.10" "0.11" "0.10" "0.13" "0.08" "0.09" "0.10"  "0.07" "0.09" "0.08"
## ALV "0.10" "0.11" "0.10" "0.11" "0.07" "0.14" "-0.00" "0.06" "0.08" "0.07"
## ARM "0.06" "0.07" "0.07" "0.12" "0.07" "0.07" "0.09"  "0.05" "0.09" "0.07"
## BAR "0.12" "0.13" "0.11" "0.11" "0.09" "0.15" "0.10"  "0.08" "0.06" "0.04"
## BRE "0.10" "0.12" "0.12" "0.18" "0.12" "0.19" "0.17"  "0.09" "0.16" "0.13"
## BUL "0.10" "0.10" "0.10" "0.12" "0.08" "0.15" "0.10"  "0.06" "0.08" "0.07"
## CES "0.13" "0.15" "0.15" "0.22" "0.15" "0.22" "0.21"  "0.12" "0.19" "0.16"
## CRO "0.09" "0.10" "0.09" "0.11" "0.06" "0.13" "0.02"  "0.05" "0.07" "0.07"
## DES "0.08" "0.10" "0.10" "0.17" "0.11" "0.18" "0.15"  "0.07" "0.14" "0.12"
## FRS "0.06" "0.07" "0.06" "0.10" "0.03" "0.12" "0.07"  "0.03" "0.07" "0.05"
## GES "0.12" "0.13" "0.13" "0.16" "0.11" "0.05" "0.14"  "0.10" "0.12" "0.11"
## GRA "0.09" "0.10" "0.09" "0.10" "0.07" "0.13" "0.06"  "0.06" "0.07" "0.06"
## GRC "0.09" "0.11" "0.10" "0.11" "0.07" "0.13" "0.06"  "0.06" "0.08" "0.07"
## IMP "0.07" "0.08" "0.08" "0.10" "0.05" "0.15" "0.10"  "0.04" "0.08" "0.06"
## ITB "0.08" "0.09" "0.09" "0.13" "0.07" "0.16" "0.11"  "0.04" "0.10" "0.08"
## ITP "0.07" "0.08" "0.06" "0.09" "0.04" "0.12" "0.06"  "0.03" "0.06" "0.04"
## ITR "0.08" "0.09" "0.07" "0.07" "0.04" "0.12" "0.06"  "0.04" "0.04" "0.03"
## KER "0.09" "0.10" "0.10" "0.13" "0.08" "0.07" "0.11"  "0.07" "0.10" "0.09"
## KRA "0.09" "0.10" "0.09" "0.13" "0.08" "0.04" "0.10"  "0.07" "0.09" "0.08"
## MAL "0.07" "0.07" "0.06" "0.07" "0.04" "0.11" "0.07"  "0.03" "0.05" "0.04"
## POP "0.05" "0.05" "0.05" "0.09" "0.04" "0.12" "0.07"  "0.03" "0.06" "0.05"
## RAR "0.13" "0.14" "0.13" "0.16" "0.11" "0.05" "0.14"  "0.11" "0.12" "0.11"
## ROM "0.08" "0.10" "0.08" "0.09" "0.04" "0.13" "0.07"  "0.03" "0.05" "0.04"
## ROS "0.09" "0.10" "0.09" "0.13" "0.08" "0.15" "0.12"  "0.05" "0.10" "0.08"
## SER "0.20" "0.22" "0.21" "0.24" "0.18" "0.25" "0.22"  "0.15" "0.20" "0.17"
## SEV "0.14" "0.15" "0.15" "0.17" "0.13" "0.08" "0.15"  "0.12" "0.14" "0.12"
## SIC "0.08" "0.09" "0.07" "0.11" "0.06" "0.14" "0.09"  "0.04" "0.08" "0.06"
## SLO "0.05" "0.06" "0.05" "0.09" "0.04" "0.11" "0.06"  "0.02" "0.06" "0.04"
## SOC "0.12" "0.13" "0.12" "0.15" "0.11" "0.06" "0.13"  "0.10" "0.12" "0.10"
## SPB NA     "0.01" "0.01" "0.13" "0.06" "0.14" "0.10"  "0.04" "0.10" "0.08"
## SPC "571"  NA     "0.03" "0.14" "0.07" "0.15" "0.11"  "0.05" "0.11" "0.08"
## SPM "823"  "252"  NA     "0.08" "0.07" "0.14" "0.11"  "0.04" "0.09" "0.07"
## SPS "332"  "567"  "787"  NA     "0.10" "0.17" "0.13"  "0.08" "0.03" "0.09"
## STS "1601" "1214" "1096" "1750" NA     "0.12" "0.08"  "0.03" "0.07" "0.05"
## TIK "3893" "3358" "3126" "3913" "2447" NA     "0.15"  "0.11" "0.14" "0.12"
## TIR "2290" "1725" "1478" "2253" "1248" "1707" NA      "0.06" "0.08" "0.07"
## TRE "1682" "1196" "1011" "1761" "381"  "2233" "877"   NA     "0.06" "0.04"
## TUA "2925" "2354" "2103" "2855" "1886" "1339" "669"   "1529" NA     "0.05"
## TUH "4076" "3519" "3275" "4054" "2745" "508"  "1801"  "2478" "1267" NA
cities <- readRDS(here("output", "sampling_loc_euro_global.rds"))
cities <- as_tibble(cities)
head(cities)
## # A tibble: 6 × 12
##   Pop_City     Country Latitude Longitude Continent Abbreviation Year  Region   
##   <chr>        <chr>      <dbl>     <dbl> <chr>     <chr>        <chr> <chr>    
## 1 Berlin, NJ   USA        39.8      -74.9 Americas  BER          2018  North Am…
## 2 Columbus, OH USA        40.0      -82.9 Americas  COL          2015  North Am…
## 3 Palm Beach   USA        26.7      -80.0 Americas  PAL          2018  North Am…
## 4 Houston, TX  USA        29.8      -95.4 Americas  HOU          2018  North Am…
## 5 Los Angeles  USA        34.1     -118.  Americas  LOS          2018  North Am…
## 6 Manaus, AM   Brazil     -3.09     -60.0 Americas  MAU          2017  South Am…
## # ℹ 4 more variables: Subregion <chr>, order <int>, order2 <int>,
## #   orderold <int>

We can sort by distance

# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Your existing code to initialize and fill the merged_matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Formatting code with absolute value for upper triangular part
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     CRO    ALD    ITP    TIR    SER    ALV    CES    ITR    ROM    SLO   
## CRO NA     "0.05" "0.05" "0.02" "0.17" "0.03" "0.18" "0.05" "0.05" "0.06"
## ALD "180"  NA     "0.08" "0.03" "0.22" "0.02" "0.20" "0.08" "0.10" "0.08"
## ITP "201"  "224"  NA     "0.06" "0.17" "0.06" "0.16" "0.02" "0.02" "0.04"
## TIR "196"  "27"   "251"  NA     "0.22" "0.00" "0.21" "0.06" "0.07" "0.06"
## SER "322"  "442"  "520"  "439"  NA     "0.18" "0.28" "0.16" "0.21" "0.15"
## ALV "260"  "93"   "235"  "99"   "534"  NA     "0.18" "0.06" "0.07" "0.06"
## CES "514"  "673"  "506"  "696"  "612"  "724"  NA     "0.17" "0.19" "0.14"
## ITR "477"  "585"  "371"  "612"  "697"  "605"  "252"  NA     "0.01" "0.04"
## ROM "477"  "585"  "371"  "612"  "697"  "605"  "252"  "0"    NA     "0.04"
## SLO "509"  "689"  "585"  "705"  "476"  "761"  "227"  "455"  "455"  NA    
## ITB "591"  "752"  "583"  "775"  "672"  "802"  "79"   "302"  "302"  "245" 
## BUL "428"  "414"  "604"  "393"  "316"  "483"  "881"  "899"  "899"  "782" 
## DES "691"  "859"  "702"  "881"  "725"  "915"  "198"  "427"  "427"  "255" 
## TRE "684"  "858"  "717"  "877"  "683"  "920"  "230"  "476"  "476"  "206" 
## BRE "719"  "887"  "728"  "908"  "752"  "942"  "223"  "445"  "445"  "280" 
## SIC "537"  "480"  "340"  "502"  "858"  "420"  "715"  "483"  "483"  "865" 
## ROS "683"  "771"  "884"  "761"  "367"  "859"  "917"  "1047" "1047" "726" 
## IMP "840"  "984"  "787"  "1009" "949"  "1020" "341"  "428"  "428"  "507" 
## GRA "711"  "534"  "705"  "515"  "884"  "476"  "1200" "1070" "1070" "1220"
## TUA "851"  "694"  "901"  "669"  "932"  "667"  "1363" "1271" "1271" "1340"
## FRS "1038" "1192" "1004" "1216" "1100" "1234" "525"  "654"  "654"  "629" 
## MAL "817"  "746"  "621"  "764"  "1139" "674"  "943"  "694"  "694"  "1118"
## STS "1053" "1230" "1098" "1248" "989"  "1297" "604"  "835"  "835"  "546" 
## GRC "934"  "754"  "884"  "742"  "1141" "679"  "1389" "1224" "1224" "1440"
## SEV "1250" "1196" "1410" "1170" "1082" "1235" "1689" "1723" "1723" "1551"
## ALU "1321" "1268" "1482" "1242" "1149" "1307" "1757" "1794" "1794" "1617"
## BAR "1333" "1446" "1227" "1473" "1489" "1458" "878"  "862"  "862"  "1059"
## SPM "1360" "1450" "1227" "1478" "1555" "1448" "956"  "883"  "883"  "1160"
## KER "1490" "1443" "1655" "1417" "1301" "1484" "1913" "1960" "1960" "1762"
## KRA "1687" "1633" "1848" "1607" "1506" "1670" "2117" "2160" "2160" "1968"
## SPC "1601" "1698" "1474" "1725" "1781" "1698" "1173" "1124" "1124" "1363"
## SOC "1748" "1678" "1898" "1651" "1591" "1706" "2197" "2224" "2224" "2060"
## TIK "1778" "1733" "1945" "1707" "1580" "1774" "2191" "2247" "2247" "2033"
## GES "1845" "1770" "1991" "1743" "1695" "1795" "2300" "2321" "2321" "2166"
## RAR "1853" "1798" "2014" "1772" "1670" "1833" "2281" "2326" "2326" "2130"
## TUH "1919" "1828" "2051" "1801" "1797" "1844" "2393" "2395" "2395" "2272"
## ARM "2235" "2144" "2367" "2117" "2107" "2159" "2706" "2711" "2711" "2581"
## SPB "2158" "2263" "2040" "2290" "2313" "2266" "1702" "1683" "1683" "1869"
## SPS "2144" "2226" "2002" "2253" "2341" "2215" "1737" "1669" "1669" "1930"
## POP "2197" "2320" "2103" "2347" "2316" "2335" "1711" "1734" "1734" "1852"
##     ITB    BUL    DES    TRE    BRE    SIC    ROS    IMP    GRA    TUA   
## CRO "0.09" "0.09" "0.13" "0.05" "0.15" "0.08" "0.10" "0.08" "0.06" "0.07"
## ALD "0.12" "0.11" "0.16" "0.08" "0.17" "0.11" "0.12" "0.11" "0.08" "0.10"
## ITP "0.07" "0.06" "0.11" "0.03" "0.13" "0.04" "0.08" "0.05" "0.05" "0.06"
## TIR "0.11" "0.10" "0.15" "0.06" "0.17" "0.09" "0.12" "0.10" "0.06" "0.08"
## SER "0.22" "0.20" "0.23" "0.15" "0.25" "0.19" "0.20" "0.23" "0.19" "0.20"
## ALV "0.10" "0.09" "0.14" "0.06" "0.16" "0.09" "0.10" "0.08" "0.06" "0.08"
## CES "0.17" "0.19" "0.14" "0.12" "0.16" "0.16" "0.17" "0.18" "0.18" "0.19"
## ITR "0.07" "0.05" "0.13" "0.04" "0.14" "0.05" "0.08" "0.05" "0.05" "0.04"
## ROM "0.08" "0.07" "0.14" "0.03" "0.16" "0.06" "0.09" "0.06" "0.05" "0.05"
## SLO "0.06" "0.07" "0.08" "0.02" "0.10" "0.05" "0.06" "0.05" "0.05" "0.06"
## ITB NA     "0.10" "0.11" "0.04" "0.12" "0.08" "0.10" "0.08" "0.10" "0.10"
## BUL "950"  NA     "0.14" "0.06" "0.15" "0.08" "0.10" "0.09" "0.09" "0.08"
## DES "126"  "1021" NA     "0.07" "0.09" "0.09" "0.11" "0.12" "0.14" "0.14"
## TRE "176"  "986"  "80"   NA     "0.09" "0.04" "0.05" "0.04" "0.06" "0.06"
## BRE "148"  "1048" "28"   "92"   NA     "0.12" "0.14" "0.13" "0.15" "0.16"
## SIC "779"  "894"  "905"  "945"  "925"  NA     "0.09" "0.07" "0.08" "0.08"
## ROS "961"  "445"  "977"  "915"  "1000" "1219" NA     "0.09" "0.10" "0.10"
## IMP "277"  "1221" "268"  "346"  "254"  "891"  "1232" NA     "0.08" "0.08"
## GRA "1279" "656"  "1390" "1391" "1417" "736"  "1100" "1492" NA     "0.07"
## TUA "1441" "641"  "1542" "1529" "1569" "994"  "1057" "1678" "277"  NA    
## FRS "448"  "1393" "375"  "428"  "349"  "1119" "1341" "228"  "1709" "1885"
## MAL "996"  "1157" "1119" "1170" "1135" "280"  "1498" "1049" "873"  "1148"
## STS "536"  "1304" "410"  "381"  "390"  "1314" "1125" "528"  "1764" "1886"
## GRC "1467" "925"  "1585" "1596" "1611" "812"  "1370" "1653" "270"  "445" 
## SEV "1753" "824"  "1804" "1756" "1830" "1652" "894"  "2029" "1086" "843" 
## ALU "1821" "895"  "1870" "1821" "1896" "1724" "950"  "2097" "1151" "904" 
## BAR "824"  "1742" "815"  "887"  "796"  "1195" "1783" "552"  "1900" "2123"
## SPM "916"  "1782" "933"  "1011" "919"  "1136" "1873" "665"  "1864" "2103"
## KER "1973" "1062" "2017" "1964" "2042" "1901" "1073" "2250" "1327" "1075"
## KRA "2178" "1261" "2222" "2170" "2248" "2084" "1276" "2455" "1478" "1214"
## SPC "1125" "2020" "1122" "1196" "1104" "1387" "2086" "857"  "2116" "2354"
## SOC "2262" "1327" "2313" "2264" "2339" "2113" "1388" "2537" "1470" "1198"
## TIK "2250" "1350" "2288" "2233" "2313" "2190" "1328" "2527" "1600" "1339"
## GES "2365" "1426" "2418" "2370" "2444" "2197" "1496" "2640" "1540" "1265"
## RAR "2342" "1427" "2385" "2331" "2410" "2246" "1432" "2619" "1629" "1361"
## TUH "2461" "1512" "2522" "2478" "2549" "2233" "1629" "2734" "1544" "1267"
## ARM "2773" "1826" "2832" "2787" "2859" "2545" "1923" "3047" "1848" "1571"
## SPB "1644" "2569" "1617" "1682" "1595" "1957" "2594" "1367" "2687" "2925"
## SPS "1690" "2568" "1689" "1761" "1670" "1865" "2651" "1424" "2601" "2855"
## POP "1645" "2591" "1597" "1653" "1572" "2065" "2566" "1370" "2777" "3001"
##     FRS    MAL    STS    GRC    SEV    ALU    BAR    SPM    KER    KRA   
## CRO "0.06" "0.06" "0.06" "0.05" "0.14" "0.09" "0.09" "0.09" "0.10" "0.09"
## ALD "0.09" "0.08" "0.10" "0.07" "0.16" "0.12" "0.12" "0.11" "0.12" "0.11"
## ITP "0.04" "0.02" "0.04" "0.06" "0.12" "0.08" "0.08" "0.06" "0.08" "0.07"
## TIR "0.07" "0.07" "0.08" "0.06" "0.15" "0.10" "0.10" "0.11" "0.11" "0.10"
## SER "0.16" "0.16" "0.18" "0.19" "0.25" "0.20" "0.22" "0.21" "0.20" "0.20"
## ALV "0.07" "0.06" "0.07" "0.05" "0.14" "0.10" "0.10" "0.10" "0.10" "0.09"
## CES "0.15" "0.15" "0.15" "0.19" "0.23" "0.19" "0.21" "0.15" "0.18" "0.18"
## ITR "0.04" "0.02" "0.04" "0.06" "0.12" "0.07" "0.07" "0.07" "0.08" "0.07"
## ROM "0.04" "0.02" "0.04" "0.06" "0.13" "0.08" "0.08" "0.08" "0.09" "0.08"
## SLO "0.04" "0.04" "0.04" "0.06" "0.11" "0.07" "0.08" "0.05" "0.07" "0.06"
## ITB "0.06" "0.06" "0.07" "0.10" "0.16" "0.10" "0.13" "0.09" "0.11" "0.11"
## BUL "0.07" "0.06" "0.08" "0.09" "0.15" "0.10" "0.11" "0.10" "0.11" "0.10"
## DES "0.11" "0.11" "0.11" "0.14" "0.18" "0.15" "0.17" "0.10" "0.14" "0.14"
## TRE "0.03" "0.03" "0.03" "0.06" "0.12" "0.07" "0.08" "0.04" "0.07" "0.07"
## BRE "0.12" "0.12" "0.12" "0.15" "0.20" "0.16" "0.18" "0.12" "0.15" "0.15"
## SIC "0.06" "0.05" "0.06" "0.08" "0.14" "0.10" "0.10" "0.07" "0.10" "0.09"
## ROS "0.07" "0.07" "0.08" "0.10" "0.15" "0.11" "0.12" "0.09" "0.11" "0.11"
## IMP "0.05" "0.04" "0.05" "0.09" "0.15" "0.09" "0.11" "0.08" "0.09" "0.09"
## GRA "0.07" "0.05" "0.07" "0.02" "0.13" "0.08" "0.09" "0.09" "0.10" "0.08"
## TUA "0.07" "0.05" "0.07" "0.08" "0.14" "0.09" "0.06" "0.09" "0.10" "0.09"
## FRS NA     "0.04" "0.03" "0.07" "0.12" "0.08" "0.09" "0.06" "0.08" "0.08"
## MAL "1270" NA     "0.04" "0.06" "0.12" "0.07" "0.08" "0.06" "0.08" "0.07"
## STS "412"  "1523" NA     "0.07" "0.13" "0.08" "0.09" "0.07" "0.08" "0.08"
## GRC "1877" "862"  "1975" NA     "0.13" "0.09" "0.10" "0.10" "0.10" "0.09"
## SEV "2180" "1878" "2012" "1288" NA     "0.09" "0.16" "0.15" "0.07" "0.03"
## ALU "2246" "1948" "2072" "1348" "72"   NA     "0.11" "0.10" "0.07" "0.05"
## BAR "512"  "1233" "916"  "2007" "2561" "2631" NA     "0.11" "0.12" "0.11"
## SPM "684"  "1128" "1096" "1944" "2606" "2677" "211"  NA     "0.10" "0.09"
## KER "2391" "2127" "2197" "1517" "250"  "179"  "2789" "2841" NA     "0.03"
## KRA "2596" "2301" "2401" "1648" "438"  "366"  "2992" "3042" "206"  NA    
## SPC "818"  "1368" "1214" "2193" "2843" "2913" "310"  "252"  "3075" "3277"
## SOC "2689" "2316" "2512" "1618" "509"  "443"  "3068" "3107" "325"  "161" 
## TIK "2660" "2414" "2447" "1776" "539"  "467"  "3069" "3126" "290"  "132" 
## GES "2794" "2395" "2620" "1675" "614"  "550"  "3168" "3204" "434"  "257" 
## RAR "2758" "2460" "2555" "1789" "604"  "532"  "3157" "3208" "369"  "166" 
## TUH "2897" "2412" "2745" "1650" "734"  "679"  "3251" "3275" "597"  "443" 
## ARM "3207" "2719" "3045" "1937" "1034" "974"  "3566" "3591" "862"  "674" 
## SPB "1264" "1923" "1601" "2760" "3388" "3457" "827"  "823"  "3614" "3818"
## SPS "1371" "1785" "1750" "2643" "3393" "3464" "874"  "787"  "3628" "3829"
## POP "1225" "2067" "1509" "2876" "3397" "3464" "878"  "939"  "3614" "3819"
##     SPC    SOC    TIK    GES    RAR    TUH    ARM    SPB    SPS    POP   
## CRO "0.10" "0.11" "0.13" "0.12" "0.12" "0.07" "0.08" "0.09" "0.11" "0.06"
## ALD "0.13" "0.14" "0.16" "0.14" "0.15" "0.09" "0.11" "0.12" "0.14" "0.09"
## ITP "0.08" "0.10" "0.12" "0.11" "0.11" "0.04" "0.07" "0.07" "0.09" "0.04"
## TIR "0.11" "0.13" "0.15" "0.14" "0.14" "0.07" "0.09" "0.10" "0.13" "0.07"
## SER "0.22" "0.23" "0.25" "0.23" "0.24" "0.17" "0.19" "0.20" "0.24" "0.17"
## ALV "0.11" "0.12" "0.14" "0.12" "0.13" "0.07" "0.09" "0.10" "0.11" "0.07"
## CES "0.15" "0.21" "0.22" "0.21" "0.22" "0.16" "0.16" "0.13" "0.22" "0.13"
## ITR "0.09" "0.10" "0.12" "0.10" "0.10" "0.03" "0.07" "0.08" "0.07" "0.03"
## ROM "0.10" "0.11" "0.13" "0.11" "0.12" "0.04" "0.08" "0.08" "0.09" "0.04"
## SLO "0.06" "0.09" "0.11" "0.10" "0.10" "0.04" "0.05" "0.05" "0.09" "0.04"
## ITB "0.09" "0.13" "0.16" "0.14" "0.14" "0.08" "0.09" "0.08" "0.13" "0.05"
## BUL "0.10" "0.13" "0.15" "0.13" "0.14" "0.07" "0.10" "0.10" "0.12" "0.07"
## DES "0.10" "0.17" "0.18" "0.17" "0.18" "0.12" "0.11" "0.08" "0.17" "0.08"
## TRE "0.05" "0.10" "0.11" "0.10" "0.11" "0.04" "0.05" "0.04" "0.08" "0.03"
## BRE "0.12" "0.18" "0.19" "0.18" "0.19" "0.13" "0.13" "0.10" "0.18" "0.11"
## SIC "0.09" "0.12" "0.14" "0.12" "0.13" "0.06" "0.08" "0.08" "0.11" "0.05"
## ROS "0.10" "0.13" "0.15" "0.14" "0.14" "0.08" "0.09" "0.09" "0.13" "0.07"
## IMP "0.08" "0.12" "0.15" "0.13" "0.13" "0.06" "0.08" "0.07" "0.10" "0.05"
## GRA "0.10" "0.11" "0.13" "0.11" "0.12" "0.06" "0.08" "0.09" "0.10" "0.07"
## TUA "0.11" "0.12" "0.14" "0.12" "0.12" "0.05" "0.09" "0.10" "0.03" "0.06"
## FRS "0.07" "0.10" "0.12" "0.11" "0.11" "0.05" "0.06" "0.06" "0.10" "0.04"
## MAL "0.07" "0.09" "0.11" "0.10" "0.11" "0.04" "0.06" "0.07" "0.07" "0.03"
## STS "0.07" "0.11" "0.12" "0.11" "0.11" "0.05" "0.07" "0.06" "0.10" "0.04"
## GRC "0.11" "0.12" "0.13" "0.12" "0.12" "0.07" "0.08" "0.09" "0.11" "0.07"
## SEV "0.15" "0.04" "0.08" "0.05" "0.05" "0.12" "0.07" "0.14" "0.17" "0.12"
## ALU "0.11" "0.07" "0.09" "0.09" "0.08" "0.08" "0.06" "0.10" "0.13" "0.08"
## BAR "0.13" "0.14" "0.15" "0.14" "0.14" "0.04" "0.11" "0.12" "0.11" "0.09"
## SPM "0.03" "0.12" "0.14" "0.13" "0.13" "0.07" "0.07" "0.01" "0.08" "0.05"
## KER "0.10" "0.05" "0.07" "0.05" "0.05" "0.09" "0.04" "0.09" "0.13" "0.07"
## KRA "0.10" "0.01" "0.04" "0.02" "0.01" "0.08" "0.02" "0.09" "0.13" "0.07"
## SPC NA     "0.13" "0.15" "0.13" "0.14" "0.08" "0.07" "0.01" "0.14" "0.05"
## SOC "3346" NA     "0.06" "0.01" "0.02" "0.10" "0.05" "0.12" "0.15" "0.10"
## TIK "3358" "253"  NA     "0.05" "0.05" "0.12" "0.07" "0.14" "0.17" "0.12"
## GES "3445" "109"  "315"  NA     "0.03" "0.11" "0.06" "0.12" "0.16" "0.10"
## RAR "3442" "187"  "127"  "210"  NA     "0.11" "0.05" "0.13" "0.16" "0.11"
## TUH "3519" "283"  "508"  "194"  "398"  NA     "0.07" "0.08" "0.09" "0.05"
## ARM "3835" "538"  "687"  "429"  "560"  "316"  NA     "0.06" "0.12" "0.05"
## SPB "571"  "3895" "3893" "3995" "3983" "4076" "4392" NA     "0.13" "0.05"
## SPS "567"  "3892" "3913" "3988" "3995" "4054" "4370" "332"  NA     "0.09"
## POP "702"  "3906" "3885" "4010" "3981" "4104" "4417" "283"  "615"  NA

Make a table and save it as a word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]

merged_df1 <- as.data.frame(merged_df)
write.csv(merged_df1, "/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/fst/merged_df_MAF1.csv")

# Create a flextable object from the merged_matrix
ft <- qflextable(as.data.frame(merged_df)) 

ft

Population

CRO

ALD

ITP

TIR

SER

ALV

CES

ITR

ROM

SLO

ITB

BUL

DES

TRE

BRE

SIC

ROS

IMP

GRA

TUA

FRS

MAL

STS

GRC

SEV

ALU

BAR

SPM

KER

KRA

SPC

SOC

TIK

GES

RAR

TUH

ARM

SPB

SPS

POP

CRO

0.05

0.05

0.02

0.17

0.03

0.18

0.05

0.05

0.06

0.09

0.09

0.13

0.05

0.15

0.08

0.10

0.08

0.06

0.07

0.06

0.06

0.06

0.05

0.14

0.09

0.09

0.09

0.10

0.09

0.10

0.11

0.13

0.12

0.12

0.07

0.08

0.09

0.11

0.06

ALD

180

0.08

0.03

0.22

0.02

0.20

0.08

0.10

0.08

0.12

0.11

0.16

0.08

0.17

0.11

0.12

0.11

0.08

0.10

0.09

0.08

0.10

0.07

0.16

0.12

0.12

0.11

0.12

0.11

0.13

0.14

0.16

0.14

0.15

0.09

0.11

0.12

0.14

0.09

ITP

201

224

0.06

0.17

0.06

0.16

0.02

0.02

0.04

0.07

0.06

0.11

0.03

0.13

0.04

0.08

0.05

0.05

0.06

0.04

0.02

0.04

0.06

0.12

0.08

0.08

0.06

0.08

0.07

0.08

0.10

0.12

0.11

0.11

0.04

0.07

0.07

0.09

0.04

TIR

196

27

251

0.22

0.00

0.21

0.06

0.07

0.06

0.11

0.10

0.15

0.06

0.17

0.09

0.12

0.10

0.06

0.08

0.07

0.07

0.08

0.06

0.15

0.10

0.10

0.11

0.11

0.10

0.11

0.13

0.15

0.14

0.14

0.07

0.09

0.10

0.13

0.07

SER

322

442

520

439

0.18

0.28

0.16

0.21

0.15

0.22

0.20

0.23

0.15

0.25

0.19

0.20

0.23

0.19

0.20

0.16

0.16

0.18

0.19

0.25

0.20

0.22

0.21

0.20

0.20

0.22

0.23

0.25

0.23

0.24

0.17

0.19

0.20

0.24

0.17

ALV

260

93

235

99

534

0.18

0.06

0.07

0.06

0.10

0.09

0.14

0.06

0.16

0.09

0.10

0.08

0.06

0.08

0.07

0.06

0.07

0.05

0.14

0.10

0.10

0.10

0.10

0.09

0.11

0.12

0.14

0.12

0.13

0.07

0.09

0.10

0.11

0.07

CES

514

673

506

696

612

724

0.17

0.19

0.14

0.17

0.19

0.14

0.12

0.16

0.16

0.17

0.18

0.18

0.19

0.15

0.15

0.15

0.19

0.23

0.19

0.21

0.15

0.18

0.18

0.15

0.21

0.22

0.21

0.22

0.16

0.16

0.13

0.22

0.13

ITR

477

585

371

612

697

605

252

0.01

0.04

0.07

0.05

0.13

0.04

0.14

0.05

0.08

0.05

0.05

0.04

0.04

0.02

0.04

0.06

0.12

0.07

0.07

0.07

0.08

0.07

0.09

0.10

0.12

0.10

0.10

0.03

0.07

0.08

0.07

0.03

ROM

477

585

371

612

697

605

252

0

0.04

0.08

0.07

0.14

0.03

0.16

0.06

0.09

0.06

0.05

0.05

0.04

0.02

0.04

0.06

0.13

0.08

0.08

0.08

0.09

0.08

0.10

0.11

0.13

0.11

0.12

0.04

0.08

0.08

0.09

0.04

SLO

509

689

585

705

476

761

227

455

455

0.06

0.07

0.08

0.02

0.10

0.05

0.06

0.05

0.05

0.06

0.04

0.04

0.04

0.06

0.11

0.07

0.08

0.05

0.07

0.06

0.06

0.09

0.11

0.10

0.10

0.04

0.05

0.05

0.09

0.04

ITB

591

752

583

775

672

802

79

302

302

245

0.10

0.11

0.04

0.12

0.08

0.10

0.08

0.10

0.10

0.06

0.06

0.07

0.10

0.16

0.10

0.13

0.09

0.11

0.11

0.09

0.13

0.16

0.14

0.14

0.08

0.09

0.08

0.13

0.05

BUL

428

414

604

393

316

483

881

899

899

782

950

0.14

0.06

0.15

0.08

0.10

0.09

0.09

0.08

0.07

0.06

0.08

0.09

0.15

0.10

0.11

0.10

0.11

0.10

0.10

0.13

0.15

0.13

0.14

0.07

0.10

0.10

0.12

0.07

DES

691

859

702

881

725

915

198

427

427

255

126

1021

0.07

0.09

0.09

0.11

0.12

0.14

0.14

0.11

0.11

0.11

0.14

0.18

0.15

0.17

0.10

0.14

0.14

0.10

0.17

0.18

0.17

0.18

0.12

0.11

0.08

0.17

0.08

TRE

684

858

717

877

683

920

230

476

476

206

176

986

80

0.09

0.04

0.05

0.04

0.06

0.06

0.03

0.03

0.03

0.06

0.12

0.07

0.08

0.04

0.07

0.07

0.05

0.10

0.11

0.10

0.11

0.04

0.05

0.04

0.08

0.03

BRE

719

887

728

908

752

942

223

445

445

280

148

1048

28

92

0.12

0.14

0.13

0.15

0.16

0.12

0.12

0.12

0.15

0.20

0.16

0.18

0.12

0.15

0.15

0.12

0.18

0.19

0.18

0.19

0.13

0.13

0.10

0.18

0.11

SIC

537

480

340

502

858

420

715

483

483

865

779

894

905

945

925

0.09

0.07

0.08

0.08

0.06

0.05

0.06

0.08

0.14

0.10

0.10

0.07

0.10

0.09

0.09

0.12

0.14

0.12

0.13

0.06

0.08

0.08

0.11

0.05

ROS

683

771

884

761

367

859

917

1047

1047

726

961

445

977

915

1000

1219

0.09

0.10

0.10

0.07

0.07

0.08

0.10

0.15

0.11

0.12

0.09

0.11

0.11

0.10

0.13

0.15

0.14

0.14

0.08

0.09

0.09

0.13

0.07

IMP

840

984

787

1009

949

1020

341

428

428

507

277

1221

268

346

254

891

1232

0.08

0.08

0.05

0.04

0.05

0.09

0.15

0.09

0.11

0.08

0.09

0.09

0.08

0.12

0.15

0.13

0.13

0.06

0.08

0.07

0.10

0.05

GRA

711

534

705

515

884

476

1200

1070

1070

1220

1279

656

1390

1391

1417

736

1100

1492

0.07

0.07

0.05

0.07

0.02

0.13

0.08

0.09

0.09

0.10

0.08

0.10

0.11

0.13

0.11

0.12

0.06

0.08

0.09

0.10

0.07

TUA

851

694

901

669

932

667

1363

1271

1271

1340

1441

641

1542

1529

1569

994

1057

1678

277

0.07

0.05

0.07

0.08

0.14

0.09

0.06

0.09

0.10

0.09

0.11

0.12

0.14

0.12

0.12

0.05

0.09

0.10

0.03

0.06

FRS

1038

1192

1004

1216

1100

1234

525

654

654

629

448

1393

375

428

349

1119

1341

228

1709

1885

0.04

0.03

0.07

0.12

0.08

0.09

0.06

0.08

0.08

0.07

0.10

0.12

0.11

0.11

0.05

0.06

0.06

0.10

0.04

MAL

817

746

621

764

1139

674

943

694

694

1118

996

1157

1119

1170

1135

280

1498

1049

873

1148

1270

0.04

0.06

0.12

0.07

0.08

0.06

0.08

0.07

0.07

0.09

0.11

0.10

0.11

0.04

0.06

0.07

0.07

0.03

STS

1053

1230

1098

1248

989

1297

604

835

835

546

536

1304

410

381

390

1314

1125

528

1764

1886

412

1523

0.07

0.13

0.08

0.09

0.07

0.08

0.08

0.07

0.11

0.12

0.11

0.11

0.05

0.07

0.06

0.10

0.04

GRC

934

754

884

742

1141

679

1389

1224

1224

1440

1467

925

1585

1596

1611

812

1370

1653

270

445

1877

862

1975

0.13

0.09

0.10

0.10

0.10

0.09

0.11

0.12

0.13

0.12

0.12

0.07

0.08

0.09

0.11

0.07

SEV

1250

1196

1410

1170

1082

1235

1689

1723

1723

1551

1753

824

1804

1756

1830

1652

894

2029

1086

843

2180

1878

2012

1288

0.09

0.16

0.15

0.07

0.03

0.15

0.04

0.08

0.05

0.05

0.12

0.07

0.14

0.17

0.12

ALU

1321

1268

1482

1242

1149

1307

1757

1794

1794

1617

1821

895

1870

1821

1896

1724

950

2097

1151

904

2246

1948

2072

1348

72

0.11

0.10

0.07

0.05

0.11

0.07

0.09

0.09

0.08

0.08

0.06

0.10

0.13

0.08

BAR

1333

1446

1227

1473

1489

1458

878

862

862

1059

824

1742

815

887

796

1195

1783

552

1900

2123

512

1233

916

2007

2561

2631

0.11

0.12

0.11

0.13

0.14

0.15

0.14

0.14

0.04

0.11

0.12

0.11

0.09

SPM

1360

1450

1227

1478

1555

1448

956

883

883

1160

916

1782

933

1011

919

1136

1873

665

1864

2103

684

1128

1096

1944

2606

2677

211

0.10

0.09

0.03

0.12

0.14

0.13

0.13

0.07

0.07

0.01

0.08

0.05

KER

1490

1443

1655

1417

1301

1484

1913

1960

1960

1762

1973

1062

2017

1964

2042

1901

1073

2250

1327

1075

2391

2127

2197

1517

250

179

2789

2841

0.03

0.10

0.05

0.07

0.05

0.05

0.09

0.04

0.09

0.13

0.07

KRA

1687

1633

1848

1607

1506

1670

2117

2160

2160

1968

2178

1261

2222

2170

2248

2084

1276

2455

1478

1214

2596

2301

2401

1648

438

366

2992

3042

206

0.10

0.01

0.04

0.02

0.01

0.08

0.02

0.09

0.13

0.07

SPC

1601

1698

1474

1725

1781

1698

1173

1124

1124

1363

1125

2020

1122

1196

1104

1387

2086

857

2116

2354

818

1368

1214

2193

2843

2913

310

252

3075

3277

0.13

0.15

0.13

0.14

0.08

0.07

0.01

0.14

0.05

SOC

1748

1678

1898

1651

1591

1706

2197

2224

2224

2060

2262

1327

2313

2264

2339

2113

1388

2537

1470

1198

2689

2316

2512

1618

509

443

3068

3107

325

161

3346

0.06

0.01

0.02

0.10

0.05

0.12

0.15

0.10

TIK

1778

1733

1945

1707

1580

1774

2191

2247

2247

2033

2250

1350

2288

2233

2313

2190

1328

2527

1600

1339

2660

2414

2447

1776

539

467

3069

3126

290

132

3358

253

0.05

0.05

0.12

0.07

0.14

0.17

0.12

GES

1845

1770

1991

1743

1695

1795

2300

2321

2321

2166

2365

1426

2418

2370

2444

2197

1496

2640

1540

1265

2794

2395

2620

1675

614

550

3168

3204

434

257

3445

109

315

0.03

0.11

0.06

0.12

0.16

0.10

RAR

1853

1798

2014

1772

1670

1833

2281

2326

2326

2130

2342

1427

2385

2331

2410

2246

1432

2619

1629

1361

2758

2460

2555

1789

604

532

3157

3208

369

166

3442

187

127

210

0.11

0.05

0.13

0.16

0.11

TUH

1919

1828

2051

1801

1797

1844

2393

2395

2395

2272

2461

1512

2522

2478

2549

2233

1629

2734

1544

1267

2897

2412

2745

1650

734

679

3251

3275

597

443

3519

283

508

194

398

0.07

0.08

0.09

0.05

ARM

2235

2144

2367

2117

2107

2159

2706

2711

2711

2581

2773

1826

2832

2787

2859

2545

1923

3047

1848

1571

3207

2719

3045

1937

1034

974

3566

3591

862

674

3835

538

687

429

560

316

0.06

0.12

0.05

SPB

2158

2263

2040

2290

2313

2266

1702

1683

1683

1869

1644

2569

1617

1682

1595

1957

2594

1367

2687

2925

1264

1923

1601

2760

3388

3457

827

823

3614

3818

571

3895

3893

3995

3983

4076

4392

0.13

0.05

SPS

2144

2226

2002

2253

2341

2215

1737

1669

1669

1930

1690

2568

1689

1761

1670

1865

2651

1424

2601

2855

1371

1785

1750

2643

3393

3464

874

787

3628

3829

567

3892

3913

3988

3995

4054

4370

332

0.09

POP

2197

2320

2103

2347

2316

2335

1711

1734

1734

1852

1645

2591

1597

1653

1572

2065

2566

1370

2777

3001

1225

2067

1509

2876

3397

3464

878

939

3614

3819

702

3906

3885

4010

3981

4104

4417

283

615

2. Mantel test with Europe Set 3

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01_b \
--keep-fam output/fst/pops_4bfst.txt \
--make-bed \
--out output/fst/mantel_MAF1 \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel_MAF1.log

20968 variants loaded from .bim file. –keep-fam: 408 people remaining. Total genotyping rate in remaining samples is 0.971028. 20968 variants and 408 people pass filters and QC.

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/mantel_MAF1 \
--recodeA \
--out output/fst/mantel_MAF1 \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel_MAF1.log

20968 variants loaded from .bim file. 20968 variants and 408 people pass filters and QC.

Import the data and covert it to genind format

# import the data
albo <-
  read.PLINK(
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/mantel_MAF1.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)

Save

saveRDS(albo2, here(
  "output", "europe", "fst", "europe_MAF1.rds"
))

Load it

albo2 <- readRDS(here(
  "output", "europe", "fst", "europe_MAF1.rds"
))

Import Sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/mantel_MAF1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# # Replace the BEN a with BEN b (remember to never name samples with the same ID... I change it manually in the fam file.)
# fam_data <- fam_data %>% 
#   mutate(IndividualID = ifelse(FamilyID == "BEN" & IndividualID == "a", "b", IndividualID))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1      SOC         1065          0          0   0        -9    Sochi  Russia
## 2      SOC         1066          0          0   0        -9    Sochi  Russia
## 3      SOC         1067          0          0   0        -9    Sochi  Russia
## 4      SOC         1068          0          0   0        -9    Sochi  Russia
## 5      SOC         1069          0          0   0        -9    Sochi  Russia
## 6      SOC         1070          0          0   0        -9    Sochi  Russia
##   Latitude Longitude Continent Year         Region   Subregion order order2
## 1 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 2 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 3 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 4 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 5 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 6 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
##   orderold
## 1       38
## 2       38
## 3       38
## 4       38
## 5       38
## 6       38

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##          [,1]     [,2]
## [1,] 43.60042 39.74533
## [2,] 43.60042 39.74533
## [3,] 43.60042 39.74533
## [4,] 43.60042 39.74533
## [5,] 43.60042 39.74533
## [6,] 43.60042 39.74533
colnames(albo_dist2)<- c("x","y") 

Add jitter

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 43.59856 39.74459
## [2,] 43.59977 39.74418
## [3,] 43.59923 39.74627
## [4,] 43.59845 39.74716
## [5,] 43.60109 39.74391
## [6,] 43.60151 39.74429

Add to object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "output", "europe", "fst", "europe_MAF1.rds"
  )
)

2.1 Isolation By Distance for Europe (Set 3)

Convert the data

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Get 1 mosquito per population, it is just to get the geographical coordinates

unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

Dgen <- dist.genpop(toto,method=2)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen,Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: -0.01144286 
## 
## Based on 999 replicates
## Simulated p-value: 0.533 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
## -0.141786675  0.001207842  0.007960831

Simulated p-value is not statistically signficant. Alternative hypothesis: greater. The test was one-sided, checking if the observed correlation is greater than what would be expected by chance.

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "simIBD_europe_MAF1.pdf"))
plot(ibd)
dev.off()
plot(ibd)

plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "Genetic_v_Geog_distance_europe_MAF_1.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

dev.off()

Use library MASS for plot

library(MASS)

dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
#  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

title("Isolation by distance")

Save plot

library(MASS)
CairoPDF(here(
     "output", "europe", "fst", "IDB_PlotFromMASS_density_MAF1.pdf"))
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
 # colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")

dev.off()
png(here("output", "europe", "fst", "ibd2_MAF1.png"),
    width = 5,
    height = 4,
    units='in',
    res = 300)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")
dev.off()

3. Subsets by Region for Set 3

Subset by region

Add subset column

iberia <- c("Spain", "Portugal")
italy <- c("Italy")
central <- c("Albania", "Croatia", "Greece")
east <- c("Armenia", "Georgia", "Russia", "Ukraine")
other <- c("Bulgaria", "France", "Malta", "Romania", "Serbia", "Slovenia", "Turkey")

library(dplyr)
filtered_sampling_loc2 <- mutate(filtered_sampling_loc, subset_cat = factor(case_when(Country %in% iberia ~ "Iberian Peninsula", Country %in% italy ~ "Italy", Country %in% central ~ "Albania, Croatia, Greece", Country %in% east ~ "Eastern", Country %in% other ~ "Central", TRUE ~ NA_character_)))

Select subsets

countries_for_subsets <- filtered_sampling_loc2 %>%
  group_by(subset_cat) %>%
  filter(subset_cat !="Central") %>% 
  pull(subset_cat) %>%
  unique()
countries_for_subsets
## [1] Iberian Peninsula        Italy                    Albania, Croatia, Greece
## [4] Eastern                 
## Levels: Albania, Croatia, Greece Central Eastern Iberian Peninsula Italy

results <- list()

for (country in countries_with_3_pops) { # Extract abbreviations for the country abbreviations <- filtered_sampling_loc %>% filter(Country == country) %>% pull(Abbreviation)

# Subset the data subset_data <- data %>% filter(row_index %in% abbreviations & col_index %in% abbreviations)

# Perform linear regression lm_model <- lm(FST ~ Distance, data = subset_data) results[[country]] <- list( equation = sprintf(“y = %.5fx + %.3f”, coef(lm_model)[2], coef(lm_model)[1]), r2 = sprintf(“R^2 = %.2f”, summary(lm_model)$r.squared) ) }

results Do test for each country

results <- list()

for (subset_cat in countries_for_subsets) {
  # Extract abbreviations for the country
  abbreviations <- filtered_sampling_loc2 %>%
    filter(subset_cat == subset_cat) %>%
    pull(Abbreviation)
  
  # Subset the data
  subset_data <- data %>%
    filter(row_index %in% abbreviations & col_index %in% abbreviations)
  
  # Perform linear regression
  lm_model <- lm(FST ~ Distance, data = subset_data)
  results[[subset_cat]] <- list(
    equation = sprintf("y = %.5fx + %.3f", coef(lm_model)[2], coef(lm_model)[1]),
    r2 = sprintf("R^2 = %.2f", summary(lm_model)$r.squared)
  )
}

results
## $`Iberian Peninsula`
## $`Iberian Peninsula`$equation
## [1] "y = 0.00001x + 0.088"
## 
## $`Iberian Peninsula`$r2
## [1] "R^2 = 0.03"
## 
## 
## $Italy
## $Italy$equation
## [1] "y = 0.00001x + 0.088"
## 
## $Italy$r2
## [1] "R^2 = 0.03"
## 
## 
## $`Albania, Croatia, Greece`
## $`Albania, Croatia, Greece`$equation
## [1] "y = 0.00001x + 0.088"
## 
## $`Albania, Croatia, Greece`$r2
## [1] "R^2 = 0.03"
## 
## 
## $Eastern
## $Eastern$equation
## [1] "y = 0.00001x + 0.088"
## 
## $Eastern$r2
## [1] "R^2 = 0.03"

Merge the data

data_merged <- data %>%
  left_join(filtered_sampling_loc2[, c("Pop_City", "subset_cat", "Abbreviation")], by = c("row_index" = "Abbreviation")) %>%
  rename(Country1 = subset_cat) %>%
  left_join(filtered_sampling_loc2[, c("Pop_City", "subset_cat", "Abbreviation")], by = c("col_index" = "Abbreviation")) %>%
  dplyr::select(-Pop_City.x, -Pop_City.y) %>%
  filter(Country1 == subset_cat)  # Ensures the data is within the same country


# Filter to get the coutries with 3 or more sampling localities
to_include <- c("Italy", "Albania, Croatia, Greece", "Eastern", "Iberian Peninsula")

# Filter
data_filtered <- data_merged %>%
  group_by(Country1) %>%
  filter(n() >= 3 & Country1 %in% to_include) %>%
  ungroup()

Calculate linear regression for each country

regression_results <- data_filtered %>%
  group_by(Country1) %>%
  do(model = lm(FST ~ Distance, data = .)) %>%
  rowwise() %>%
  mutate(equation = sprintf("y = %.3fx + %.3f", coef(model)[2], coef(model)[1]),   r2 = sprintf("R^2 = %.2f", summary(model)$r.squared))

Plot it

ggplot(data_filtered, aes(x = Distance, y = FST)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ Country1, scales = "free", ncol = 2) +
  geom_text(data = regression_results, aes(label = paste(equation, r2, sep = "\n"), x = Inf, y = Inf), 
            vjust = 2, hjust = 2, size = 3.5, inherit.aes = FALSE) +
  labs(title = "FST vs Distance by Region",
       x = "Distance",
       y = "FST") +
  scale_x_continuous(labels = scales::comma) + 
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "europe", "fst", "r_01", "fst_by_distance_Europe_subsets_r01_LD2.pdf"),
  width = 6,
  height = 8,
  units = "in"
)
module load PLINK/1.9b_6.21-x86_64

4. IBD for subsets

4.1 Iberian Peninsula Mantel test

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01_b \
--keep-fam output/fst/subsets/pops_iberia.txt \
--make-bed \
--out output/fst/subsets/mantel_iberia \
--silent;
grep 'samples\|variants\|remaining' output/fst/subsets/mantel_iberia.log

20968 variants loaded from .bim file. –keep-fam: 52 people remaining. Total genotyping rate in remaining samples is 0.974247. 20968 variants and 52 people pass filters and QC.

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/subsets/mantel_iberia \
--recodeA \
--out output/fst/subsets/mantel_iberia \
--silent;
grep 'samples\|variants\|remaining' output/fst/subsets/mantel_iberia.log

20968 variants loaded from .bim file. 20968 variants and 52 people pass filters and QC.

Import the data and covert it to genind format

# import the data
albo <-
  read.PLINK(
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/mantel_iberia.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)

Save

saveRDS(albo2, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/iberia_MAF1.rds"
))

Load it

albo2 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/iberia_MAF1.rds"
))

Import Sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/mantel_iberia.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      BAR          279          0          0   0        -9
## 2      BAR          280          0          0   0        -9
## 3      BAR          281          0          0   0        -9
## 4      BAR          282          0          0   0        -9
## 5      BAR          283          0          0   0        -9
## 6      BAR          284          0          0   0        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype  Pop_City Country
## 1      BAR          279          0          0   0        -9 Barcelona   Spain
## 2      BAR          280          0          0   0        -9 Barcelona   Spain
## 3      BAR          281          0          0   0        -9 Barcelona   Spain
## 4      BAR          282          0          0   0        -9 Barcelona   Spain
## 5      BAR          283          0          0   0        -9 Barcelona   Spain
## 6      BAR          284          0          0   0        -9 Barcelona   Spain
##   Latitude Longitude Continent Year          Region   Subregion order order2
## 1  41.3851    2.1734    Europe 2018 Southern Europe West Europe    16      8
## 2  41.3851    2.1734    Europe 2018 Southern Europe West Europe    16      8
## 3  41.3851    2.1734    Europe 2018 Southern Europe West Europe    16      8
## 4  41.3851    2.1734    Europe 2018 Southern Europe West Europe    16      8
## 5  41.3851    2.1734    Europe 2018 Southern Europe West Europe    16      8
## 6  41.3851    2.1734    Europe 2018 Southern Europe West Europe    16      8
##   orderold
## 1        8
## 2        8
## 3        8
## 4        8
## 5        8
## 6        8

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##         [,1]   [,2]
## [1,] 41.3851 2.1734
## [2,] 41.3851 2.1734
## [3,] 41.3851 2.1734
## [4,] 41.3851 2.1734
## [5,] 41.3851 2.1734
## [6,] 41.3851 2.1734
colnames(albo_dist2)<- c("x","y") 

Add jitter

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 41.38322 2.154412
## [2,] 41.37141 2.194045
## [3,] 41.37607 2.173582
## [4,] 41.39055 2.159370
## [5,] 41.39245 2.175299
## [6,] 41.39631 2.163812

Add to object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/iberia_albo2.rds"
  )
)

4.1.1 Isolation By Distance for Iberia (Set 3)

Convert the data

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Get 1 mosquito per population, it is just to get the geographical coordinates

unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

Dgen <- dist.genpop(toto,method=2)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen,Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: 0.01596436 
## 
## Based on 999 replicates
## Simulated p-value: 0.41 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
##  0.104039999 -0.006539113  0.046784121

Observation: 0.01440828
Simulated p-value (0.434) is not statistically signficant. Alternative hypothesis: greater. The test was one-sided, checking if the observed correlation is greater than what would be expected by chance. Std.Obs Expectation Variance 0.049392821 0.003712072 0.046895623

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "subsets", "simIBD_iberia_MAF1.pdf"))
plot(ibd)
dev.off()
plot(ibd)

plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "subsets", "Genetic_v_Geog_distance_iberia_MAF_1.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

dev.off()

Use library MASS for plot

library(MASS)

dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
#  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

title("Isolation by distance")

Save plot

library(MASS)
CairoPDF(here(
     "output", "europe", "fst", "subsets", "IDB_PlotFromMASS_density_iberia_MAF1.pdf"))
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
 # colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")

dev.off()
png(here("output", "europe", "fst", "subsets", "ibd2_iberia_MAF1.png"),
    width = 5,
    height = 4,
    units='in',
    res = 300)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")
dev.off()

4.2 Albania, Croatia, Greece Mantel test

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01_b \
--keep-fam output/fst/subsets/pops_acg.txt \
--make-bed \
--out output/fst/subsets/mantel_acg \
--silent;
grep 'samples\|variants\|remaining' output/fst/subsets/mantel_acg.log

20968 variants loaded from .bim file. –keep-fam: 59 people remaining. Total genotyping rate in remaining samples is 0.973622. 20968 variants and 59 people pass filters and QC.

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/subsets/mantel_acg \
--recodeA \
--out output/fst/subsets/mantel_acg \
--silent;
grep 'samples\|variants\|remaining' output/fst/subsets/mantel_acg.log

20968 variants loaded from .bim file. 20968 variants and 59 people pass filters and QC.

Import the data and covert it to genind format

# import the data
albo <-
  read.PLINK(
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/mantel_acg.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)

Save

saveRDS(albo2, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/agc_MAF1.rds"
))

Load it

albo2 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/agc_MAF1.rds"
))

Import Sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/mantel_acg.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      TIR          193          0          0   0        -9
## 2      TIR          194          0          0   0        -9
## 3      TIR          195          0          0   0        -9
## 4      TIR          196          0          0   0        -9
## 5      GRC         2202          0          0   1        -9
## 6      CRO          711          0          0   0        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype  Pop_City Country
## 1      TIR          193          0          0   0        -9    Tirana Albania
## 2      TIR          194          0          0   0        -9    Tirana Albania
## 3      TIR          195          0          0   0        -9    Tirana Albania
## 4      TIR          196          0          0   0        -9    Tirana Albania
## 5      GRC         2202          0          0   1        -9    Chania  Greece
## 6      CRO          711          0          0   0        -9 Dubrovnik Croatia
##   Latitude Longitude Continent Year          Region   Subregion order order2
## 1 41.31473  19.83172    Europe 2017 Southern Europe East Europe    34     26
## 2 41.31473  19.83172    Europe 2017 Southern Europe East Europe    34     26
## 3 41.31473  19.83172    Europe 2017 Southern Europe East Europe    34     26
## 4 41.31473  19.83172    Europe 2017 Southern Europe East Europe    34     26
## 5 35.51448  24.01796    Europe 2019 Southern Europe East Europe    37     29
## 6 42.60654  18.22661    Europe 2017 Southern Europe East Europe    31     23
##   orderold
## 1       26
## 2       26
## 3       26
## 4       26
## 5       29
## 6       23

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##          [,1]     [,2]
## [1,] 41.31473 19.83172
## [2,] 41.31473 19.83172
## [3,] 41.31473 19.83172
## [4,] 41.31473 19.83172
## [5,] 35.51448 24.01796
## [6,] 42.60654 18.22661
colnames(albo_dist2)<- c("x","y") 

Add jitter

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 41.31548 19.83117
## [2,] 41.31652 19.83194
## [3,] 41.31406 19.83051
## [4,] 41.31448 19.83003
## [5,] 35.51458 24.01847
## [6,] 42.60811 18.22540

Add to object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/acg_albo2.rds"
  )
)

4.2.1 Isolation By Distance for Albania, Croatia, Greece (Set 3)

Convert the data

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Get 1 mosquito per population, it is just to get the geographical coordinates

unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

Dgen <- dist.genpop(toto,method=2)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen,Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: -0.4976328 
## 
## Based on 999 replicates
## Simulated p-value: 0.985 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
## -1.699080884 -0.001331756  0.085322243

Observation: -0.4978599
Simulated p-value (0.991) is not statistically signficant. Alternative hypothesis: greater. The test was one-sided, checking if the observed correlation is greater than what would be expected by chance. Std.Obs Expectation Variance -1.688754683 0.007006367 0.089375735

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "subsets", "simIBD_acg_MAF1.pdf"))
plot(ibd)
dev.off()
plot(ibd)

plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "subsets", "Genetic_v_Geog_distance_acg_MAF_1.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

dev.off()

Use library MASS for plot

library(MASS)

dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
#  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

title("Isolation by distance")

Save plot

library(MASS)
CairoPDF(here(
     "output", "europe", "fst", "subsets", "IDB_PlotFromMASS_density_acg_MAF1.pdf"))
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
 # colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")

dev.off()
png(here("output", "europe", "fst", "subsets", "ibd2_acg_MAF1.png"),
    width = 5,
    height = 4,
    units='in',
    res = 300)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")
dev.off()

4.3 Italy Mantel test

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01_b \
--keep-fam output/fst/subsets/pops_italy.txt \
--make-bed \
--out output/fst/subsets/mantel_italy \
--silent;
grep 'samples\|variants\|remaining' output/fst/subsets/mantel_italy.log

20968 variants loaded from .bim file. –keep-fam: 97 people remaining. Total genotyping rate in remaining samples is 0.959471. 20968 variants and 97 people pass filters and QC.

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/subsets/mantel_italy \
--recodeA \
--out output/fst/subsets/mantel_italy \
--silent;
grep 'samples\|variants\|remaining' output/fst/subsets/mantel_italy.log

20968 variants loaded from .bim file. 20968 variants and 97 people pass filters and QC.

Import the data and covert it to genind format

# import the data
albo <-
  read.PLINK(
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/mantel_italy.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)

Save

saveRDS(albo2, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/italy_MAF1.rds"
))

Load it

albo2 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/italy_MAF1.rds"
))

Import Sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/mantel_italy.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      TRE         1185          0          0   0        -9
## 2      TRE         1186          0          0   0        -9
## 3      TRE         1187          0          0   0        -9
## 4      TRE         1188          0          0   0        -9
## 5      TRE         1189          0          0   2        -9
## 6      TRE         1190          0          0   2        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1      TRE         1185          0          0   0        -9 Trentino   Italy
## 2      TRE         1186          0          0   0        -9 Trentino   Italy
## 3      TRE         1187          0          0   0        -9 Trentino   Italy
## 4      TRE         1188          0          0   0        -9 Trentino   Italy
## 5      TRE         1189          0          0   2        -9 Trentino   Italy
## 6      TRE         1190          0          0   2        -9 Trentino   Italy
##   Latitude Longitude Continent Year          Region   Subregion order order2
## 1 46.05917  11.11722    Europe 2020 Southern Europe West Europe    22     14
## 2 46.05917  11.11722    Europe 2020 Southern Europe West Europe    22     14
## 3 46.05917  11.11722    Europe 2020 Southern Europe West Europe    22     14
## 4 46.05917  11.11722    Europe 2020 Southern Europe West Europe    22     14
## 5 46.05917  11.11722    Europe 2020 Southern Europe West Europe    22     14
## 6 46.05917  11.11722    Europe 2020 Southern Europe West Europe    22     14
##   orderold
## 1       14
## 2       14
## 3       14
## 4       14
## 5       14
## 6       14

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##          [,1]     [,2]
## [1,] 46.05917 11.11722
## [2,] 46.05917 11.11722
## [3,] 46.05917 11.11722
## [4,] 46.05917 11.11722
## [5,] 46.05917 11.11722
## [6,] 46.05917 11.11722
colnames(albo_dist2)<- c("x","y") 

Add jitter

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 46.05617 11.10367
## [2,] 46.04763 11.11973
## [3,] 46.05780 11.12035
## [4,] 46.05396 11.10678
## [5,] 46.05544 11.11029
## [6,] 46.07178 11.10345

Add to object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/italy_albo2.rds"
  )
)

4.3.1 Isolation By Distance for Italy (Set 3)

Convert the data

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Get 1 mosquito per population, it is just to get the geographical coordinates

unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

Dgen <- dist.genpop(toto,method=2)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen,Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: 0.03952138 
## 
## Based on 999 replicates
## Simulated p-value: 0.456 
## Alternative hypothesis: greater 
## 
##     Std.Obs Expectation    Variance 
##  0.12647946  0.01022908  0.05363730

Observation: 0.03899139
Simulated p-value (0.431) is not statistically signficant. Alternative hypothesis: greater. The test was one-sided, checking if the observed correlation is greater than what would be expected by chance. Std.Obs Expectation Variance 0.161017859 0.001150481 0.055229986

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "subsets", "simIBD_italy_MAF1.pdf"))
plot(ibd)
dev.off()
plot(ibd)

plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "subsets", "Genetic_v_Geog_distance_italy_MAF_1.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

dev.off()

Use library MASS for plot

library(MASS)

dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
#  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

title("Isolation by distance")

Save plot

library(MASS)
CairoPDF(here(
     "output", "europe", "fst", "subsets", "IDB_PlotFromMASS_density_italy_MAF1.pdf"))
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
 # colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")

dev.off()
png(here("output", "europe", "fst", "subsets", "ibd2_italy_MAF1.png"),
    width = 5,
    height = 4,
    units='in',
    res = 300)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")
dev.off()

4.4 Eastern Europe Mantel test

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01_b \
--keep-fam output/fst/subsets/pops_east.txt \
--make-bed \
--out output/fst/subsets/mantel_east \
--silent;
grep 'samples\|variants\|remaining' output/fst/subsets/mantel_east.log

20968 variants loaded from .bim file. –keep-fam: 106 people remaining. Total genotyping rate in remaining samples is 0.976328. 20968 variants and 106 people pass filters and QC.

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/subsets/mantel_east \
--recodeA \
--out output/fst/subsets/mantel_east \
--silent;
grep 'samples\|variants\|remaining' output/fst/subsets/mantel_east.log

20968 variants loaded from .bim file. 20968 variants and 106 people pass filters and QC.

Import the data and covert it to genind format

# import the data
albo <-
  read.PLINK(
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/mantel_east.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)

Save

saveRDS(albo2, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/east_MAF1.rds"
))

Load it

albo2 <- readRDS(here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/east_MAF1.rds"
))

Import Sample locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/mantel_east.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1      SOC         1065          0          0   0        -9    Sochi  Russia
## 2      SOC         1066          0          0   0        -9    Sochi  Russia
## 3      SOC         1067          0          0   0        -9    Sochi  Russia
## 4      SOC         1068          0          0   0        -9    Sochi  Russia
## 5      SOC         1069          0          0   0        -9    Sochi  Russia
## 6      SOC         1070          0          0   0        -9    Sochi  Russia
##   Latitude Longitude Continent Year         Region   Subregion order order2
## 1 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 2 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 3 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 4 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 5 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
## 6 43.60042  39.74533    Europe 2021 Eastern Europe East Europe    46     38
##   orderold
## 1       38
## 2       38
## 3       38
## 4       38
## 5       38
## 6       38

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##          [,1]     [,2]
## [1,] 43.60042 39.74533
## [2,] 43.60042 39.74533
## [3,] 43.60042 39.74533
## [4,] 43.60042 39.74533
## [5,] 43.60042 39.74533
## [6,] 43.60042 39.74533
colnames(albo_dist2)<- c("x","y") 

Add jitter

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 43.60005 39.74616
## [2,] 43.60138 39.74376
## [3,] 43.60235 39.74629
## [4,] 43.60164 39.74395
## [5,] 43.60035 39.74344
## [6,] 43.60231 39.74547

Add to object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/subsets/east_albo2.rds"
  )
)

4.4.1 Isolation By Distance for Eastern Europe (Set 3)

Convert the data

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Get 1 mosquito per population, it is just to get the geographical coordinates

unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

Dgen <- dist.genpop(toto,method=2)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen,Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: 0.149561 
## 
## Based on 999 replicates
## Simulated p-value: 0.296 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
##  0.591579962 -0.004727425  0.068020487

Observation: 0.1495636
Simulated p-value (0.325) is not statistically signficant. Alternative hypothesis: greater. The test was one-sided, checking if the observed correlation is greater than what would be expected by chance. Std.Obs Expectation Variance 0.55289454 0.00886304 0.06476002

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "subsets", "simIBD_east_MAF1.pdf"))
plot(ibd)
dev.off()
plot(ibd)

plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "output", "europe", "fst", "subsets", "Genetic_v_Geog_distance_east_MAF_1.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

dev.off()

Use library MASS for plot

library(MASS)

dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
#  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

title("Isolation by distance")

Save plot

library(MASS)
CairoPDF(here(
     "output", "europe", "fst", "subsets", "IDB_PlotFromMASS_density_east_MAF1_equ.pdf"))
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
 # colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")

dev.off()
png(here("output", "europe", "fst", "subsets", "ibd2_east_MAF1.png"),
    width = 5,
    height = 4,
    units='in',
    res = 300)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")
dev.off()

5. Comparison of the 3 SNP Set

# Remove all objects from the environment
rm(list = ls())
# Run the garbage collector to free up memory
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  6350564 339.2   19014132 1015.5  23767665 1269.4
## Vcells 14639377 111.7  198256644 1512.6 309601531 2362.1

Import the data

Set 1

Set1 <- readRDS(
  here(
"output", "europe", "fst", "r_01", "LD2_country_europe_r01.rds"
  )
)
head(Set1)
## # A tibble: 6 × 2
##   Country  mean_fst
##   <chr>       <dbl>
## 1 Albania      0.1 
## 2 Armenia      0.08
## 3 Bulgaria     0.1 
## 4 Croatia      0.09
## 5 France       0.08
## 6 Georgia      0.11

Set 2

Set2 <- readRDS(
  here(
"output", "europe", "fst", "LD2_country.rds"
  )
)
head(Set2)
## # A tibble: 6 × 2
##   Country  mean_fst
##   <chr>       <dbl>
## 1 Albania      0.1 
## 2 Armenia      0.08
## 3 Bulgaria     0.1 
## 4 Croatia      0.09
## 5 France       0.08
## 6 Georgia      0.11

Set 3

Set3 <- readRDS(
  here(
  "output", "europe", "fst", "fst_country_MAF1.rds"
  )
)
head(Set3)
## # A tibble: 6 × 2
##   Country  mean_fst
##   <chr>       <dbl>
## 1 Albania      0.1 
## 2 Armenia      0.08
## 3 Bulgaria     0.1 
## 4 Croatia      0.09
## 5 France       0.08
## 6 Georgia      0.11

Merge the datasets

# Add an identifier column to each dataset
Set1$dataset <- "Set1"
Set2$dataset <- "Set2"
Set3$dataset <- "Set3"

# Merge the datasets
combined_data <- rbind(Set1, Set2, Set3)

# Assign a numeric index for plotting based on ordered countries
combined_data <- combined_data %>% 
  arrange(Country) %>%
  group_by(Country) %>%
  mutate(index = dense_rank(Country))

# Custom color mapping
color_mapping <- scale_color_manual(
  values = c(Set1 = "green3", Set2 = "blue", Set3 = "orange"),
  name = "Dataset"
)


# Shapes
shape_mapping <- scale_shape_manual(
  values = c(Set1 = 16, Set2 = 17, Set3 = 18),
  name = "Dataset"
)


# Scatter plot with custom colors and shapes
plot <- ggplot(combined_data, aes(x = Country, y = mean_fst, color = dataset, shape = dataset)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = FALSE, aes(group = dataset)) +  # Fitted line per dataset
  labs(
    title = "",
    x = "Countries",
    y = "Mean Fst"
  ) +
  color_mapping +
  shape_mapping +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "top")

# Display the plot
print(plot)
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("output", "europe", "fst", "fst_datasets_comparison.pdf"),
  plot, 
  width = 7,
  height = 5,
  units = "in"
)

6. Re-run Fst for same subset of samples with SNPs and microsats

Create list SNP pops with at least 4 individuals shared with microsats

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe

awk '{print $1}' output/snps_sets/r2_0.01_b.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 4 {print}' | awk '{print $1}' > output/fst/pops_4bfst.txt;
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
head  output/fst/pops_4bfst.txt;
wc -l output/fst/pops_4bfst.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## 40 output/fst/pops_4bfst.txt
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
head  output/fst/pops_4bfst_overlap.txt;
wc output/fst/pops_4bfst_overlap.txt
## ALD
## BAR
## BUL
## CRO
## FRS
## GES
## GRA
## GRC
## ITB
## ITP
##  25  25 125 output/fst/pops_4bfst_overlap.txt

We have 25 populations with both SNP & microsat data

6.1 Convert to raw format and subset the bed file

First load plink

module load PLINK/1.9b_6.21-x86_64
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01_b \
--keep-fam output/fst/pops_4bfst_overlap.txt \
--recodeA \
--out output/fst/overlap \
--silent;
grep 'samples\|variants\|remaining' output/fst/overlap.log

20968 variants loaded from .bim file. –keep-fam: 242 people remaining. Total genotyping rate in remaining samples is 0.973943. 20968 variants and 242 people pass filters and QC.

Look at https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of Fst estimations

LD2 <-
  read.PLINK(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/overlap.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )
## 
##  Reading PLINK raw format into a genlight object... 
## 
## 
##  Reading loci information... 
## 
##  Reading and converting genotypes... 
## .
##  Building final object... 
## 
## ...done.
summary(LD2)
##   Length    Class     Mode 
##        1 genlight       S4

6.2 Convert the genlight object to Stampp format, and estimate pairwide Fst values

# convert
LD2_2 <- stamppConvert(LD2, type="genlight")

This chunk will take a couple minutes to run.

# run stampp. If you want to run with bootstraps and nclusters use the HPC. It will run out of memory on a 32Gb laptop
LD2_3 <- stamppFst(LD2_2, 1, 95, 1)

Save it

saveRDS(
  LD2_3, here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/LD2_MAF1_overlap.rds"
  )
)

To load it

LD2_3 <- readRDS(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/LD2_MAF1_overlap.rds"
  )
)

Now lets look at the object

summary(LD2_3)
##       SOC               GES               STS               ITR         
##  Min.   :0.01127   Min.   :0.09666   Min.   :0.03010   Min.   :0.01703  
##  1st Qu.:0.10175   1st Qu.:0.10817   1st Qu.:0.04447   1st Qu.:0.03965  
##  Median :0.11561   Median :0.12145   Median :0.06819   Median :0.05552  
##  Mean   :0.11638   Mean   :0.12689   Mean   :0.06935   Mean   :0.05977  
##  3rd Qu.:0.13102   3rd Qu.:0.13765   3rd Qu.:0.07530   3rd Qu.:0.07254  
##  Max.   :0.22672   Max.   :0.23290   Max.   :0.17530   Max.   :0.15959  
##  NA's   :1         NA's   :2         NA's   :3         NA's   :4        
##       GRC               BAR               BUL               CRO         
##  Min.   :0.02418   Min.   :0.04376   Min.   :0.05680   Min.   :0.04540  
##  1st Qu.:0.06578   1st Qu.:0.08255   1st Qu.:0.06811   1st Qu.:0.05812  
##  Median :0.07974   Median :0.10140   Median :0.09262   Median :0.07012  
##  Mean   :0.08455   Mean   :0.10467   Mean   :0.09302   Mean   :0.08014  
##  3rd Qu.:0.09938   3rd Qu.:0.12147   3rd Qu.:0.10293   3rd Qu.:0.09397  
##  Max.   :0.18931   Max.   :0.21778   Max.   :0.19614   Max.   :0.17329  
##  NA's   :5         NA's   :6         NA's   :7         NA's   :8        
##       GRA               ITB               MAL               SPM          
##  Min.   :0.05215   Min.   :0.05193   Min.   :0.02244   Min.   :0.006262  
##  1st Qu.:0.06488   1st Qu.:0.06287   1st Qu.:0.03897   1st Qu.:0.052607  
##  Median :0.07909   Median :0.08386   Median :0.06256   Median :0.066148  
##  Mean   :0.08549   Mean   :0.09413   Mean   :0.06264   Mean   :0.077446  
##  3rd Qu.:0.09821   3rd Qu.:0.10322   3rd Qu.:0.07392   3rd Qu.:0.093330  
##  Max.   :0.18592   Max.   :0.21971   Max.   :0.16130   Max.   :0.213896  
##  NA's   :9         NA's   :10        NA's   :11        NA's   :12        
##       TUA               TUH               ALD               FRS         
##  Min.   :0.02844   Min.   :0.03796   Min.   :0.07934   Min.   :0.03937  
##  1st Qu.:0.05808   1st Qu.:0.04592   1st Qu.:0.09148   1st Qu.:0.04284  
##  Median :0.06820   Median :0.07927   Median :0.11690   Median :0.06241  
##  Mean   :0.08408   Mean   :0.07715   Mean   :0.11822   Mean   :0.07281  
##  3rd Qu.:0.10014   3rd Qu.:0.08713   3rd Qu.:0.12858   3rd Qu.:0.07965  
##  Max.   :0.19864   Max.   :0.17311   Max.   :0.21576   Max.   :0.16404  
##  NA's   :13        NA's   :14        NA's   :15        NA's   :16       
##       ITP               POP               ROS               SER        
##  Min.   :0.03615   Min.   :0.03586   Min.   :0.05906   Min.   :0.1529  
##  1st Qu.:0.05294   1st Qu.:0.05241   1st Qu.:0.09236   1st Qu.:0.1877  
##  Median :0.07859   Median :0.06361   Median :0.09956   Median :0.2100  
##  Mean   :0.08048   Mean   :0.07861   Mean   :0.11744   Mean   :0.2029  
##  3rd Qu.:0.08405   3rd Qu.:0.08702   3rd Qu.:0.13313   3rd Qu.:0.2252  
##  Max.   :0.17461   Max.   :0.16508   Max.   :0.20308   Max.   :0.2386  
##  NA's   :17        NA's   :18        NA's   :19        NA's   :20      
##       SLO               SPC                SPB              SPS     
##  Min.   :0.05438   Min.   :0.007113   Min.   :0.1281   Min.   : NA  
##  1st Qu.:0.05841   1st Qu.:0.039494   1st Qu.:0.1281   1st Qu.: NA  
##  Median :0.06244   Median :0.071875   Median :0.1281   Median : NA  
##  Mean   :0.06970   Mean   :0.071875   Mean   :0.1281   Mean   :NaN  
##  3rd Qu.:0.07737   3rd Qu.:0.104256   3rd Qu.:0.1281   3rd Qu.: NA  
##  Max.   :0.09230   Max.   :0.136637   Max.   :0.1281   Max.   : NA  
##  NA's   :21        NA's   :22         NA's   :23       NA's   :24

If you want you can save the fst values as csv.

# Convert to data frame
LD2_df <- data.frame(LD2_3)
# Save it
write.csv(LD2_df, file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/LD2_df_01_overlap.csv"))

Check the Fst values

head(LD2_df)
##            SOC       GES        STS        ITR       GRC BAR BUL CRO GRA ITB
## SOC         NA        NA         NA         NA        NA  NA  NA  NA  NA  NA
## GES 0.01126978        NA         NA         NA        NA  NA  NA  NA  NA  NA
## STS 0.10591967 0.1092791         NA         NA        NA  NA  NA  NA  NA  NA
## ITR 0.09551819 0.1005738 0.03982659         NA        NA  NA  NA  NA  NA  NA
## GRC 0.11671692 0.1220816 0.07434949 0.05710568        NA  NA  NA  NA  NA  NA
## BAR 0.13602044 0.1427050 0.09268239 0.06549046 0.1009452  NA  NA  NA  NA  NA
##     MAL SPM TUA TUH ALD FRS ITP POP ROS SER SLO SPC SPB SPS
## SOC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GES  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## STS  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## ITR  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GRC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## BAR  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

Convert the data into a matrix.

aa <- as.matrix(LD2_df)
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##            SOC        GES        STS        ITR        GRC        BAR
## SOC         NA 0.01126978 0.10591967 0.09551819 0.11671692 0.13602044
## GES 0.01126978         NA 0.10927914 0.10057375 0.12208159 0.14270495
## STS 0.10591967 0.10927914         NA 0.03982659 0.07434949 0.09268239
## ITR 0.09551819 0.10057375 0.03982659         NA 0.05710568 0.06549046
## GRC 0.11671692 0.12208159 0.07434949 0.05710568         NA 0.10094517
## BAR 0.13602044 0.14270495 0.09268239 0.06549046 0.10094517         NA
##            BUL        CRO        GRA        ITB        MAL        SPM
## SOC 0.12832355 0.11495017 0.10889177 0.13371682 0.09458982 0.12205750
## GES 0.13384104 0.12036716 0.11376844 0.13892193 0.10125102 0.12971049
## STS 0.07799253 0.06354859 0.07077473 0.07120783 0.03980952 0.06532393
## ITR 0.05392586 0.04949314 0.04798060 0.06838917 0.01702597 0.06977034
## GRC 0.09338818 0.05248230 0.02417956 0.09782363 0.05730552 0.09532555
## BAR 0.10885970 0.09393694 0.09225317 0.12946121 0.07823715 0.11048101
##            TUA        TUH        ALD        FRS        ITP        POP
## SOC 0.11826743 0.10435177 0.13932531 0.10279814 0.10070244 0.09884633
## GES 0.12401729 0.11137798 0.14452571 0.10779885 0.10598420 0.10461590
## STS 0.07272436 0.05322703 0.09733023 0.03009784 0.04109490 0.04320132
## ITR 0.03907520 0.03526920 0.08060398 0.03984728 0.02050277 0.03645268
## GRC 0.07973727 0.07020895 0.07388877 0.07086343 0.06244349 0.06910929
## BAR 0.06317696 0.04376254 0.12083140 0.08889532 0.08100898 0.08718046
##            ROS       SER        SLO        SPC        SPB        SPS
## SOC 0.13423764 0.2267217 0.09140753 0.12576249 0.11561471 0.15080720
## GES 0.13912010 0.2329000 0.09665973 0.13312607 0.12082033 0.15814054
## STS 0.07530389 0.1752962 0.04446887 0.06818588 0.06120598 0.09859755
## ITR 0.07758959 0.1595922 0.04133080 0.08683449 0.07820355 0.07085827
## GRC 0.10336915 0.1893113 0.05999989 0.10519477 0.09488816 0.10597149
## BAR 0.12469756 0.2177834 0.07855252 0.12926803 0.12167900 0.11394015
#sampling_loc <- read_csv("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_SNPs_fst.csv")

Save it

saveRDS(
  sampling_loc, here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"
  )
)

Import sample locations

sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))


# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    order
  )

# Check it
head(sampling_loc)
## # A tibble: 6 × 10
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 1 more variable: order <dbl>

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##  [1] "FRS" "STS" "POP" "SPB" "SPS" "SPC" "BAR" "SPM" "ITB" "ITR" "ITP" "MAL"
## [13] "SLO" "CRO" "ALD" "SER" "GRA" "GRC" "ROS" "BUL" "TUA" "TUH" "SOC" "GES"

Create vector with order of populations

# Extract the populations that appear in LD2_df
populations_in_LD2 <- colnames(LD2_df)

# Reorder the populations based on order_pops
poporder <- populations_in_LD2[populations_in_LD2 %in% order_pops]

#LD2_df[match(poporder, LD2_df$Abbreviation),] #this also doesn't reorder it

# Print the reordered populations
print(poporder)
##  [1] "SOC" "GES" "STS" "ITR" "GRC" "BAR" "BUL" "CRO" "GRA" "ITB" "MAL" "SPM"
## [13] "TUA" "TUH" "ALD" "FRS" "ITP" "POP" "ROS" "SER" "SLO" "SPC" "SPB" "SPS"

Lets check if the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Order the matrix using poporder. We will also add NA on the upper left side of the matrix.

aa <- aa[order_pops, order_pops]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1          Var2         value        
##  FRS    : 24   FRS    : 24   Min.   :0.00626  
##  STS    : 24   STS    : 24   1st Qu.:0.06090  
##  POP    : 24   POP    : 24   Median :0.08701  
##  SPB    : 24   SPB    : 24   Mean   :0.09085  
##  SPS    : 24   SPS    : 24   3rd Qu.:0.10948  
##  SPC    : 24   SPC    : 24   Max.   :0.23859  
##  (Other):432   (Other):432   NA's   :300

Now lets plot the data with ggplot. You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions. Now lets plot the data with ggplot. You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

Save it

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fst_matrix_europe_MAF01_overlap.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)

6.3 Fst by country for SNP Set overlap

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(LD2_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}

# Check if the matrix is symmetric
2
## [1] 2
# print(isSymmetric(symmetric_fst_matrix))

# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##             Albania   Bulgaria    Croatia     France    Georgia     Greece
## Albania          NA 0.11279822 0.04539978 0.09472371 0.14452571 0.07649130
## Bulgaria 0.11279822         NA 0.08722725 0.07492144 0.13384104 0.09300545
## Croatia  0.04539978 0.08722725         NA 0.06302575 0.12036716 0.05544159
## France   0.09472371 0.07492144 0.06302575         NA 0.10853899 0.07064597
## Georgia  0.14452571 0.13384104 0.12036716 0.10853899         NA 0.11792501
## Greece   0.07649130 0.09300545 0.05544159 0.07064597 0.11792501         NA
## Italy    0.09154900 0.06636464 0.05965627 0.04890522 0.11515996 0.06500279
## Malta    0.08316641 0.05679897 0.05500955 0.03897441 0.10125102 0.05472746
## Portugal 0.09147718 0.06636208 0.06402375 0.04227483 0.10461590 0.06798589
## Romania  0.12354519 0.10293539 0.09925514 0.07490660 0.13912010 0.10150763
## Russia   0.13932531 0.12832355 0.11495017 0.10435891 0.01126978 0.11280435
## Serbia   0.21575958 0.19614135 0.17328819 0.16966823 0.23289996 0.18761510
## Slovenia 0.07933804 0.06530144 0.05729126 0.04390333 0.09665973 0.05726520
## Spain    0.12227116 0.10696192 0.09537205 0.07523979 0.13690047 0.09579872
## Turkey   0.09399600 0.07568412 0.07012296 0.05986011 0.11769763 0.07154177
##               Italy      Malta   Portugal    Romania     Russia    Serbia
## Albania  0.09154900 0.08316641 0.09147718 0.12354519 0.13932531 0.2157596
## Bulgaria 0.06636464 0.05679897 0.06636208 0.10293539 0.12832355 0.1961414
## Croatia  0.05965627 0.05500955 0.06402375 0.09925514 0.11495017 0.1732882
## France   0.04890522 0.03897441 0.04227483 0.07490660 0.10435891 0.1696682
## Georgia  0.11515996 0.10125102 0.10461590 0.13912010 0.01126978 0.2329000
## Greece   0.06500279 0.05472746 0.06798589 0.10150763 0.11280435 0.1876151
## Italy            NA 0.03035905 0.04157277 0.08661136 0.10997915 0.1846366
## Malta    0.03035905         NA 0.02975032 0.07357926 0.09458982 0.1612972
## Portugal 0.04157277 0.02975032         NA 0.07259454 0.09884633 0.1650812
## Romania  0.08661136 0.07357926 0.07259454         NA 0.13423764 0.2030822
## Russia   0.10997915 0.09458982 0.09884633 0.13423764         NA 0.2267217
## Serbia   0.18463665 0.16129723 0.16508121 0.20308218 0.22672167        NA
## Slovenia 0.04478448 0.03916767 0.03586147 0.05906033 0.09140753 0.1528773
## Spain    0.08665648 0.07372547 0.06801861 0.10896586 0.13005247 0.2176911
## Turkey   0.05623848 0.04497299 0.05514368 0.09174178 0.11130960 0.1858761
##            Slovenia      Spain     Turkey
## Albania  0.07933804 0.12227116 0.09399600
## Bulgaria 0.06530144 0.10696192 0.07568412
## Croatia  0.05729126 0.09537205 0.07012296
## France   0.04390333 0.07523979 0.05986011
## Georgia  0.09665973 0.13690047 0.11769763
## Greece   0.05726520 0.09579872 0.07154177
## Italy    0.04478448 0.08665648 0.05623848
## Malta    0.03916767 0.07372547 0.04497299
## Portugal 0.03586147 0.06801861 0.05514368
## Romania  0.05906033 0.10896586 0.09174178
## Russia   0.09140753 0.13005247 0.11130960
## Serbia   0.15287729 0.21769108 0.18587606
## Slovenia         NA 0.06748962 0.05168875
## Spain    0.06748962         NA 0.07429569
## Turkey   0.05168875 0.07429569         NA
symmetric_fst_matrix[lower.tri(symmetric_fst_matrix)] <- NA
print(symmetric_fst_matrix)
##          Albania  Bulgaria    Croatia     France   Georgia     Greece
## Albania       NA 0.1127982 0.04539978 0.09472371 0.1445257 0.07649130
## Bulgaria      NA        NA 0.08722725 0.07492144 0.1338410 0.09300545
## Croatia       NA        NA         NA 0.06302575 0.1203672 0.05544159
## France        NA        NA         NA         NA 0.1085390 0.07064597
## Georgia       NA        NA         NA         NA        NA 0.11792501
## Greece        NA        NA         NA         NA        NA         NA
## Italy         NA        NA         NA         NA        NA         NA
## Malta         NA        NA         NA         NA        NA         NA
## Portugal      NA        NA         NA         NA        NA         NA
## Romania       NA        NA         NA         NA        NA         NA
## Russia        NA        NA         NA         NA        NA         NA
## Serbia        NA        NA         NA         NA        NA         NA
## Slovenia      NA        NA         NA         NA        NA         NA
## Spain         NA        NA         NA         NA        NA         NA
## Turkey        NA        NA         NA         NA        NA         NA
##               Italy      Malta   Portugal    Romania     Russia    Serbia
## Albania  0.09154900 0.08316641 0.09147718 0.12354519 0.13932531 0.2157596
## Bulgaria 0.06636464 0.05679897 0.06636208 0.10293539 0.12832355 0.1961414
## Croatia  0.05965627 0.05500955 0.06402375 0.09925514 0.11495017 0.1732882
## France   0.04890522 0.03897441 0.04227483 0.07490660 0.10435891 0.1696682
## Georgia  0.11515996 0.10125102 0.10461590 0.13912010 0.01126978 0.2329000
## Greece   0.06500279 0.05472746 0.06798589 0.10150763 0.11280435 0.1876151
## Italy            NA 0.03035905 0.04157277 0.08661136 0.10997915 0.1846366
## Malta            NA         NA 0.02975032 0.07357926 0.09458982 0.1612972
## Portugal         NA         NA         NA 0.07259454 0.09884633 0.1650812
## Romania          NA         NA         NA         NA 0.13423764 0.2030822
## Russia           NA         NA         NA         NA         NA 0.2267217
## Serbia           NA         NA         NA         NA         NA        NA
## Slovenia         NA         NA         NA         NA         NA        NA
## Spain            NA         NA         NA         NA         NA        NA
## Turkey           NA         NA         NA         NA         NA        NA
##            Slovenia      Spain     Turkey
## Albania  0.07933804 0.12227116 0.09399600
## Bulgaria 0.06530144 0.10696192 0.07568412
## Croatia  0.05729126 0.09537205 0.07012296
## France   0.04390333 0.07523979 0.05986011
## Georgia  0.09665973 0.13690047 0.11769763
## Greece   0.05726520 0.09579872 0.07154177
## Italy    0.04478448 0.08665648 0.05623848
## Malta    0.03916767 0.07372547 0.04497299
## Portugal 0.03586147 0.06801861 0.05514368
## Romania  0.05906033 0.10896586 0.09174178
## Russia   0.09140753 0.13005247 0.11130960
## Serbia   0.15287729 0.21769108 0.18587606
## Slovenia         NA 0.06748962 0.05168875
## Spain            NA         NA 0.07429569
## Turkey           NA         NA         NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix)
summary(pairfst.long2)
##        Var1           Var2         value        
##  Albania : 15   Albania : 15   Min.   :0.01127  
##  Bulgaria: 15   Bulgaria: 15   1st Qu.:0.06402  
##  Croatia : 15   Croatia : 15   Median :0.09148  
##  France  : 15   France  : 15   Mean   :0.09696  
##  Georgia : 15   Georgia : 15   3rd Qu.:0.11770  
##  Greece  : 15   Greece  : 15   Max.   :0.23290  
##  (Other) :135   (Other) :135   NA's   :120

You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fst_matrix_europe_MAF01_by_country_overlap.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 24 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 FRS     0.0690
##  2 STS     0.0727
##  3 POP     0.0662
##  4 SPB     0.0861
##  5 SPS     0.112 
##  6 SPC     0.0955
##  7 BAR     0.105 
##  8 SPM     0.0846
##  9 ITB     0.0975
## 10 ITR     0.0622
## # ℹ 14 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na()

# check output
head(fst3)
## # A tibble: 6 × 11
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 2 more variables: order <dbl>, mean_fst <dbl>

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 24 × 12
##    Pop_City       Country Latitude Longitude Continent Abbreviation  Year Region
##    <chr>          <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr> 
##  1 Saint-Martin-… France      45.2     5.77  Europe    FRS           2019 Weste…
##  2 Strasbourg     France      48.6     7.75  Europe    STS           2019 Weste…
##  3 Penafiel       Portug…     41.2    -8.33  Europe    POP           2017 South…
##  4 Badajoz        Spain       38.9    -6.97  Europe    SPB           2018 South…
##  5 San Roque      Spain       36.2    -5.37  Europe    SPS           2017 South…
##  6 Catarroja      Spain       39.4    -0.396 Europe    SPC           2017 South…
##  7 Barcelona      Spain       41.4     2.17  Europe    BAR           2018 South…
##  8 Magaluf        Spain       39.5     2.53  Europe    SPM           2017 South…
##  9 Bologna        Italy       44.5    11.4   Europe    ITB           2017 South…
## 10 Rome           Italy       41.9    12.5   Europe    ITR           2013 South…
## # ℹ 14 more rows
## # ℹ 4 more variables: Marker <chr>, order <dbl>, mean_fst <dbl>,
## #   mean_fst_by_region <dbl>

Mean By country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 24 × 13
##    Pop_City       Country Latitude Longitude Continent Abbreviation  Year Region
##    <chr>          <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr> 
##  1 Saint-Martin-… France      45.2     5.77  Europe    FRS           2019 Weste…
##  2 Strasbourg     France      48.6     7.75  Europe    STS           2019 Weste…
##  3 Penafiel       Portug…     41.2    -8.33  Europe    POP           2017 South…
##  4 Badajoz        Spain       38.9    -6.97  Europe    SPB           2018 South…
##  5 San Roque      Spain       36.2    -5.37  Europe    SPS           2017 South…
##  6 Catarroja      Spain       39.4    -0.396 Europe    SPC           2017 South…
##  7 Barcelona      Spain       41.4     2.17  Europe    BAR           2018 South…
##  8 Magaluf        Spain       39.5     2.53  Europe    SPM           2017 South…
##  9 Bologna        Italy       44.5    11.4   Europe    ITB           2017 South…
## 10 Rome           Italy       41.9    12.5   Europe    ITR           2013 South…
## # ℹ 14 more rows
## # ℹ 5 more variables: Marker <chr>, order <dbl>, mean_fst <dbl>,
## #   mean_fst_by_region <dbl>, mean_fst_by_country <dbl>
# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City)

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 24 × 13
##    City    Country Latitude Longitude Continent Abbreviation  Year Region Marker
##    <chr>   <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
##  1 Saint-… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
##  2 Strasb… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
##  3 Penafi… Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
##  4 Badajoz Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
##  5 San Ro… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
##  6 Catarr… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
##  7 Barcel… Spain       41.4     2.17  Europe    BAR           2018 South… SNPs  
##  8 Magaluf Spain       39.5     2.53  Europe    SPM           2017 South… SNPs  
##  9 Bologna Italy       44.5    11.4   Europe    ITB           2017 South… SNPs  
## 10 Rome    Italy       41.9    12.5   Europe    ITR           2013 South… SNPs  
## # ℹ 14 more rows
## # ℹ 4 more variables: order <dbl>, mean_fst <dbl>, mean_fst_by_region <dbl>,
## #   mean_fst_by_country <dbl>

Mean By continent

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Continent) |>
  summarize(mean_fst_by_continent = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Continent")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 24 × 14
##    City    Country Latitude Longitude Continent Abbreviation  Year Region Marker
##    <chr>   <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
##  1 Saint-… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
##  2 Strasb… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
##  3 Penafi… Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
##  4 Badajoz Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
##  5 San Ro… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
##  6 Catarr… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
##  7 Barcel… Spain       41.4     2.17  Europe    BAR           2018 South… SNPs  
##  8 Magaluf Spain       39.5     2.53  Europe    SPM           2017 South… SNPs  
##  9 Bologna Italy       44.5    11.4   Europe    ITB           2017 South… SNPs  
## 10 Rome    Italy       41.9    12.5   Europe    ITR           2013 South… SNPs  
## # ℹ 14 more rows
## # ℹ 5 more variables: order <dbl>, mean_fst <dbl>, mean_fst_by_region <dbl>,
## #   mean_fst_by_country <dbl>, mean_fst_by_continent <dbl>
fst4 <- fst3 |>
  dplyr::select(
    Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
## # A tibble: 6 × 7
##   Region       mean_fst_by_region Country mean_fst_by_country City  Abbreviation
##   <chr>                     <dbl> <chr>                 <dbl> <chr> <chr>       
## 1 Eastern Eur…               0.11 Bulgar…                0.09 Lom   BUL         
## 2 Eastern Eur…               0.11 Georgia                0.12 Sakh… GES         
## 3 Eastern Eur…               0.11 Romania                0.1  Satu… ROS         
## 4 Eastern Eur…               0.11 Russia                 0.12 Sochi SOC         
## 5 Eastern Eur…               0.11 Serbia                 0.19 Novi… SER         
## 6 Eastern Eur…               0.11 Turkey                 0.08 Alia… TUA         
## # ℹ 1 more variable: mean_fst <dbl>
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values for European SNPs overlapping with microsat dataset.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values for European SNPs overlapping with microsat dataset.

Region

mean_fst_by_region

Country

mean_fst_by_country

City

Abbreviation

mean_fst

Eastern Europe

0.11

Bulgaria

0.09

Lom

BUL

0.09

Eastern Europe

0.11

Georgia

0.12

Sakhumi, Abkhazia

GES

0.12

Eastern Europe

0.11

Romania

0.10

Satu Mare

ROS

0.10

Eastern Europe

0.11

Russia

0.12

Sochi

SOC

0.12

Eastern Europe

0.11

Serbia

0.19

Novi Sad

SER

0.19

Eastern Europe

0.11

Turkey

0.08

Aliaga

TUA

0.08

Eastern Europe

0.11

Turkey

0.08

Hopa

TUH

0.07

Southern Europe

0.08

Albania

0.11

Durres

ALD

0.11

Southern Europe

0.08

Croatia

0.08

Dubrovnik

CRO

0.08

Southern Europe

0.08

Greece

0.08

Athens

GRA

0.08

Southern Europe

0.08

Greece

0.08

Chania

GRC

0.09

Southern Europe

0.08

Italy

0.08

Bologna

ITB

0.10

Southern Europe

0.08

Italy

0.08

Puglia

ITP

0.07

Southern Europe

0.08

Italy

0.08

Rome

ITR

0.06

Southern Europe

0.08

Malta

0.06

Luqa

MAL

0.06

Southern Europe

0.08

Portugal

0.07

Penafiel

POP

0.07

Southern Europe

0.08

Slovenia

0.06

Ajdovscina

SLO

0.06

Southern Europe

0.08

Spain

0.10

Badajoz

SPB

0.09

Southern Europe

0.08

Spain

0.10

Barcelona

BAR

0.11

Southern Europe

0.08

Spain

0.10

Catarroja

SPC

0.10

Southern Europe

0.08

Spain

0.10

Magaluf

SPM

0.08

Southern Europe

0.08

Spain

0.10

San Roque

SPS

0.11

Western Europe

0.07

France

0.07

Saint-Martin-d'Heres

FRS

0.07

Western Europe

0.07

France

0.07

Strasbourg

STS

0.07

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fstTable_Europe_SNPs_MAF01_overlap.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/LD2_country_europe_MAF01_overlap.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Save it

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/mean_fst_by_country_SNPs_MAF01_overlap.pdf"),
  width = 10,
  height = 10,
  units = "in"
)

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           FRS       STS       POP       SPB       SPS       SPC      BAR
## FRS    0.0000  412.1522 1225.4383 1263.7518 1371.4448  817.6702 511.8716
## STS  412.1522    0.0000 1509.1745 1601.1573 1750.4874 1213.5639 915.9614
## POP 1225.4383 1509.1745    0.0000  282.8409  614.5101  701.9619 878.2910
## SPB 1263.7518 1601.1573  282.8409    0.0000  331.7685  571.0509 827.0716
## SPS 1371.4448 1750.4874  614.5101  331.7685    0.0000  566.5153 874.4050
## SPC  817.6702 1213.5639  701.9619  571.0509  566.5153    0.0000 310.0074
##           SPM       ITB       ITR      ITP      MAL       SLO      CRO      ALD
## FRS  683.8845  448.0871  654.1062 1004.474 1269.904  628.7258 1037.845 1192.210
## STS 1095.6072  536.0256  834.5057 1098.126 1522.503  546.2343 1052.803 1230.239
## POP  939.4811 1644.8695 1734.4434 2102.566 2066.730 1851.8481 2197.100 2319.742
## SPB  822.8201 1643.6694 1683.0686 2040.301 1923.429 1869.2123 2157.626 2262.516
## SPS  787.3372 1690.3169 1669.1413 2002.327 1785.308 1929.6259 2144.409 2225.649
## SPC  251.7726 1124.6677 1124.2692 1474.434 1368.139 1363.2153 1600.981 1697.515
##           SER      GRA      GRC      ROS      BUL      TUA      TUH      SOC
## FRS 1100.2591 1709.174 1877.379 1341.427 1392.500 1884.818 2897.030 2688.509
## STS  988.6774 1763.684 1974.712 1124.852 1304.105 1885.544 2745.357 2511.584
## POP 2315.5228 2777.366 2876.039 2566.256 2591.385 3000.557 4103.736 3906.072
## SPB 2313.3538 2686.566 2760.206 2594.045 2569.099 2924.845 4076.402 3894.554
## SPS 2340.8167 2601.355 2642.916 2651.131 2568.416 2854.982 4053.790 3891.912
## SPC 1780.6744 2115.539 2193.176 2085.530 2019.617 2354.477 3518.971 3346.038
##          GES
## FRS 2793.592
## STS 2619.862
## POP 4009.674
## SPB 3995.220
## SPS 3988.376
## SPC 3444.704

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0


# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))

data <- data |>
  dplyr::arrange(
    Distance
  )
head(data)
##   Distance FST row_index col_index
## 1        0   0       FRS       FRS
## 2        0   0       STS       STS
## 3        0   0       POP       POP
## 4        0   0       SPB       SPB
## 5        0   0       SPS       SPS
## 6        0   0       SPC       SPC

Fit linear regression

data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("analyses", "my_theme2.R"))


# Plot
ggplot(data, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
 annotate("text", x = max(log((data$Distance))) * 0.85, y = max(data$FST/(1-data$FST)) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),  # Increase font size for x-axis
        axis.text.y = element_text(size = 14                               ))  # Increase font size for y-axi
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fst_by_distance_SNPs_MAF01_overlap_with_equ.pdf"),
  width = 6,
  height = 4,
  units = "in"
)

Without equation

data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("analyses", "my_theme2.R"))


# Plot
ggplot(data, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
# annotate("text", x = max(log((data$Distance))) * 0.85, y = max(data$FST/(1-data$FST)) * 0.95, label = paste(equation_text, #r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),  # Increase font size for x-axis
        axis.text.y = element_text(size = 14                               ))  # Increase font size for y-axi
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fst_by_distance_SNPs_MAF01_overlap.pdf"),
  width = 6,
  height = 4,
  units = "in"
)

We can merge the FST and distance matrices

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the matrix (Fst two decimals and distance in Km with zero decimals)
# Format the elements based on their position in the matrix
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Now the merged_matrix should be formatted as you need
print(merged_matrix)
##     ALD    BAR    BUL    CRO    FRS    GES    GRA    GRC    ITB    ITP   
## ALD NA     "0.12" "0.11" "0.05" "0.09" "0.14" "0.08" "0.07" "0.12" "0.08"
## BAR "1446" NA     "0.11" "0.09" "0.09" "0.14" "0.09" "0.10" "0.13" "0.08"
## BUL "414"  "1742" NA     "0.09" "0.07" "0.13" "0.09" "0.09" "0.10" "0.06"
## CRO "180"  "1333" "428"  NA     "0.06" "0.12" "0.06" "0.05" "0.09" "0.05"
## FRS "1192" "512"  "1393" "1038" NA     "0.11" "0.07" "0.07" "0.06" "0.04"
## GES "1770" "3168" "1426" "1845" "2794" NA     "0.11" "0.12" "0.14" "0.11"
## GRA "534"  "1900" "656"  "711"  "1709" "1540" NA     "0.02" "0.10" "0.05"
## GRC "754"  "2007" "925"  "934"  "1877" "1675" "270"  NA     "0.10" "0.06"
## ITB "752"  "824"  "950"  "591"  "448"  "2365" "1279" "1467" NA     "0.07"
## ITP "224"  "1227" "604"  "201"  "1004" "1991" "705"  "884"  "583"  NA    
## ITR "585"  "862"  "899"  "477"  "654"  "2321" "1070" "1224" "302"  "371" 
## MAL "746"  "1233" "1157" "817"  "1270" "2395" "873"  "862"  "996"  "621" 
## POP "2320" "878"  "2591" "2197" "1225" "4010" "2777" "2876" "1645" "2103"
## ROS "771"  "1783" "445"  "683"  "1341" "1496" "1100" "1370" "961"  "884" 
## SER "442"  "1489" "316"  "322"  "1100" "1695" "884"  "1141" "672"  "520" 
## SLO "689"  "1059" "782"  "509"  "629"  "2166" "1220" "1440" "245"  "585" 
## SOC "1678" "3068" "1327" "1748" "2689" "109"  "1470" "1618" "2262" "1898"
## SPB "2263" "827"  "2569" "2158" "1264" "3995" "2687" "2760" "1644" "2040"
## SPC "1698" "310"  "2020" "1601" "818"  "3445" "2116" "2193" "1125" "1474"
## SPM "1450" "211"  "1782" "1360" "684"  "3204" "1864" "1944" "916"  "1227"
## SPS "2226" "874"  "2568" "2144" "1371" "3988" "2601" "2643" "1690" "2002"
## STS "1230" "916"  "1304" "1053" "412"  "2620" "1764" "1975" "536"  "1098"
## TUA "694"  "2123" "641"  "851"  "1885" "1265" "277"  "445"  "1441" "901" 
## TUH "1828" "3251" "1512" "1919" "2897" "194"  "1544" "1650" "2461" "2051"
##     ITR    MAL    POP    ROS    SER    SLO    SOC    SPB    SPC    SPM   
## ALD "0.08" "0.08" "0.09" "0.12" "0.22" "0.08" "0.14" "0.12" "0.13" "0.11"
## BAR "0.07" "0.08" "0.09" "0.12" "0.22" "0.08" "0.14" "0.12" "0.13" "0.11"
## BUL "0.05" "0.06" "0.07" "0.10" "0.20" "0.07" "0.13" "0.10" "0.10" "0.10"
## CRO "0.05" "0.06" "0.06" "0.10" "0.17" "0.06" "0.11" "0.09" "0.10" "0.09"
## FRS "0.04" "0.04" "0.04" "0.07" "0.16" "0.04" "0.10" "0.06" "0.07" "0.06"
## GES "0.10" "0.10" "0.10" "0.14" "0.23" "0.10" "0.01" "0.12" "0.13" "0.13"
## GRA "0.05" "0.05" "0.07" "0.10" "0.19" "0.05" "0.11" "0.09" "0.10" "0.09"
## GRC "0.06" "0.06" "0.07" "0.10" "0.19" "0.06" "0.12" "0.09" "0.11" "0.10"
## ITB "0.07" "0.06" "0.05" "0.10" "0.22" "0.06" "0.13" "0.08" "0.09" "0.09"
## ITP "0.02" "0.02" "0.04" "0.08" "0.17" "0.04" "0.10" "0.07" "0.08" "0.06"
## ITR NA     "0.02" "0.04" "0.08" "0.16" "0.04" "0.10" "0.08" "0.09" "0.07"
## MAL "694"  NA     "0.03" "0.07" "0.16" "0.04" "0.09" "0.07" "0.07" "0.06"
## POP "1734" "2067" NA     "0.07" "0.17" "0.04" "0.10" "0.05" "0.05" "0.05"
## ROS "1047" "1498" "2566" NA     "0.20" "0.06" "0.13" "0.09" "0.10" "0.09"
## SER "697"  "1139" "2316" "367"  NA     "0.15" "0.23" "0.20" "0.22" "0.21"
## SLO "455"  "1118" "1852" "726"  "476"  NA     "0.09" "0.05" "0.06" "0.05"
## SOC "2224" "2316" "3906" "1388" "1591" "2060" NA     "0.12" "0.13" "0.12"
## SPB "1683" "1923" "283"  "2594" "2313" "1869" "3895" NA     "0.01" "0.01"
## SPC "1124" "1368" "702"  "2086" "1781" "1363" "3346" "571"  NA     "0.03"
## SPM "883"  "1128" "939"  "1873" "1555" "1160" "3107" "823"  "252"  NA    
## SPS "1669" "1785" "615"  "2651" "2341" "1930" "3892" "332"  "567"  "787" 
## STS "835"  "1523" "1509" "1125" "989"  "546"  "2512" "1601" "1214" "1096"
## TUA "1271" "1148" "3001" "1057" "932"  "1340" "1198" "2925" "2354" "2103"
## TUH "2395" "2412" "4104" "1629" "1797" "2272" "283"  "4076" "3519" "3275"
##     SPS    STS    TUA    TUH   
## ALD "0.14" "0.10" "0.10" "0.09"
## BAR "0.11" "0.09" "0.06" "0.04"
## BUL "0.12" "0.08" "0.08" "0.07"
## CRO "0.11" "0.06" "0.07" "0.07"
## FRS "0.10" "0.03" "0.07" "0.05"
## GES "0.16" "0.11" "0.12" "0.11"
## GRA "0.10" "0.07" "0.07" "0.06"
## GRC "0.11" "0.07" "0.08" "0.07"
## ITB "0.13" "0.07" "0.10" "0.08"
## ITP "0.09" "0.04" "0.06" "0.04"
## ITR "0.07" "0.04" "0.04" "0.04"
## MAL "0.07" "0.04" "0.05" "0.04"
## POP "0.09" "0.04" "0.06" "0.05"
## ROS "0.13" "0.08" "0.10" "0.08"
## SER "0.24" "0.18" "0.20" "0.17"
## SLO "0.09" "0.04" "0.06" "0.04"
## SOC "0.15" "0.11" "0.12" "0.10"
## SPB "0.13" "0.06" "0.10" "0.08"
## SPC "0.14" "0.07" "0.11" "0.08"
## SPM "0.08" "0.07" "0.09" "0.07"
## SPS NA     "0.10" "0.03" "0.09"
## STS "1750" NA     "0.07" "0.05"
## TUA "2855" "1886" NA     "0.05"
## TUH "4054" "2745" "1267" NA
cities <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
cities <- as_tibble(cities)
head(cities)
## # A tibble: 6 × 10
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 1 more variable: order <dbl>

We can sort by distance

# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Your existing code to initialize and fill the merged_matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Formatting code with absolute value for upper triangular part
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     CRO    ITP    ITR    ALD    ITB    SLO    SER    BUL    FRS    MAL   
## CRO NA     "0.05" "0.05" "0.05" "0.09" "0.06" "0.17" "0.09" "0.06" "0.06"
## ITP "201"  NA     "0.02" "0.08" "0.07" "0.04" "0.17" "0.06" "0.04" "0.02"
## ITR "477"  "371"  NA     "0.08" "0.07" "0.04" "0.16" "0.05" "0.04" "0.02"
## ALD "180"  "224"  "585"  NA     "0.12" "0.08" "0.22" "0.11" "0.09" "0.08"
## ITB "591"  "583"  "302"  "752"  NA     "0.06" "0.22" "0.10" "0.06" "0.06"
## SLO "509"  "585"  "455"  "689"  "245"  NA     "0.15" "0.07" "0.04" "0.04"
## SER "322"  "520"  "697"  "442"  "672"  "476"  NA     "0.20" "0.16" "0.16"
## BUL "428"  "604"  "899"  "414"  "950"  "782"  "316"  NA     "0.07" "0.06"
## FRS "1038" "1004" "654"  "1192" "448"  "629"  "1100" "1393" NA     "0.04"
## MAL "817"  "621"  "694"  "746"  "996"  "1118" "1139" "1157" "1270" NA    
## ROS "683"  "884"  "1047" "771"  "961"  "726"  "367"  "445"  "1341" "1498"
## GRA "711"  "705"  "1070" "534"  "1279" "1220" "884"  "656"  "1709" "873" 
## STS "1053" "1098" "835"  "1230" "536"  "546"  "989"  "1304" "412"  "1523"
## BAR "1333" "1227" "862"  "1446" "824"  "1059" "1489" "1742" "512"  "1233"
## SPM "1360" "1227" "883"  "1450" "916"  "1160" "1555" "1782" "684"  "1128"
## TUA "851"  "901"  "1271" "694"  "1441" "1340" "932"  "641"  "1885" "1148"
## GRC "934"  "884"  "1224" "754"  "1467" "1440" "1141" "925"  "1877" "862" 
## SPC "1601" "1474" "1124" "1698" "1125" "1363" "1781" "2020" "818"  "1368"
## SPB "2158" "2040" "1683" "2263" "1644" "1869" "2313" "2569" "1264" "1923"
## SPS "2144" "2002" "1669" "2226" "1690" "1930" "2341" "2568" "1371" "1785"
## POP "2197" "2103" "1734" "2320" "1645" "1852" "2316" "2591" "1225" "2067"
## SOC "1748" "1898" "2224" "1678" "2262" "2060" "1591" "1327" "2689" "2316"
## GES "1845" "1991" "2321" "1770" "2365" "2166" "1695" "1426" "2794" "2395"
## TUH "1919" "2051" "2395" "1828" "2461" "2272" "1797" "1512" "2897" "2412"
##     ROS    GRA    STS    BAR    SPM    TUA    GRC    SPC    SPB    SPS   
## CRO "0.10" "0.06" "0.06" "0.09" "0.09" "0.07" "0.05" "0.10" "0.09" "0.11"
## ITP "0.08" "0.05" "0.04" "0.08" "0.06" "0.06" "0.06" "0.08" "0.07" "0.09"
## ITR "0.08" "0.05" "0.04" "0.07" "0.07" "0.04" "0.06" "0.09" "0.08" "0.07"
## ALD "0.12" "0.08" "0.10" "0.12" "0.11" "0.10" "0.07" "0.13" "0.12" "0.14"
## ITB "0.10" "0.10" "0.07" "0.13" "0.09" "0.10" "0.10" "0.09" "0.08" "0.13"
## SLO "0.06" "0.05" "0.04" "0.08" "0.05" "0.06" "0.06" "0.06" "0.05" "0.09"
## SER "0.20" "0.19" "0.18" "0.22" "0.21" "0.20" "0.19" "0.22" "0.20" "0.24"
## BUL "0.10" "0.09" "0.08" "0.11" "0.10" "0.08" "0.09" "0.10" "0.10" "0.12"
## FRS "0.07" "0.07" "0.03" "0.09" "0.06" "0.07" "0.07" "0.07" "0.06" "0.10"
## MAL "0.07" "0.05" "0.04" "0.08" "0.06" "0.05" "0.06" "0.07" "0.07" "0.07"
## ROS NA     "0.10" "0.08" "0.12" "0.09" "0.10" "0.10" "0.10" "0.09" "0.13"
## GRA "1100" NA     "0.07" "0.09" "0.09" "0.07" "0.02" "0.10" "0.09" "0.10"
## STS "1125" "1764" NA     "0.09" "0.07" "0.07" "0.07" "0.07" "0.06" "0.10"
## BAR "1783" "1900" "916"  NA     "0.11" "0.06" "0.10" "0.13" "0.12" "0.11"
## SPM "1873" "1864" "1096" "211"  NA     "0.09" "0.10" "0.03" "0.01" "0.08"
## TUA "1057" "277"  "1886" "2123" "2103" NA     "0.08" "0.11" "0.10" "0.03"
## GRC "1370" "270"  "1975" "2007" "1944" "445"  NA     "0.11" "0.09" "0.11"
## SPC "2086" "2116" "1214" "310"  "252"  "2354" "2193" NA     "0.01" "0.14"
## SPB "2594" "2687" "1601" "827"  "823"  "2925" "2760" "571"  NA     "0.13"
## SPS "2651" "2601" "1750" "874"  "787"  "2855" "2643" "567"  "332"  NA    
## POP "2566" "2777" "1509" "878"  "939"  "3001" "2876" "702"  "283"  "615" 
## SOC "1388" "1470" "2512" "3068" "3107" "1198" "1618" "3346" "3895" "3892"
## GES "1496" "1540" "2620" "3168" "3204" "1265" "1675" "3445" "3995" "3988"
## TUH "1629" "1544" "2745" "3251" "3275" "1267" "1650" "3519" "4076" "4054"
##     POP    SOC    GES    TUH   
## CRO "0.06" "0.11" "0.12" "0.07"
## ITP "0.04" "0.10" "0.11" "0.04"
## ITR "0.04" "0.10" "0.10" "0.04"
## ALD "0.09" "0.14" "0.14" "0.09"
## ITB "0.05" "0.13" "0.14" "0.08"
## SLO "0.04" "0.09" "0.10" "0.04"
## SER "0.17" "0.23" "0.23" "0.17"
## BUL "0.07" "0.13" "0.13" "0.07"
## FRS "0.04" "0.10" "0.11" "0.05"
## MAL "0.03" "0.09" "0.10" "0.04"
## ROS "0.07" "0.13" "0.14" "0.08"
## GRA "0.07" "0.11" "0.11" "0.06"
## STS "0.04" "0.11" "0.11" "0.05"
## BAR "0.09" "0.14" "0.14" "0.04"
## SPM "0.05" "0.12" "0.13" "0.07"
## TUA "0.06" "0.12" "0.12" "0.05"
## GRC "0.07" "0.12" "0.12" "0.07"
## SPC "0.05" "0.13" "0.13" "0.08"
## SPB "0.05" "0.12" "0.12" "0.08"
## SPS "0.09" "0.15" "0.16" "0.09"
## POP NA     "0.10" "0.10" "0.05"
## SOC "3906" NA     "0.01" "0.10"
## GES "4010" "109"  NA     "0.11"
## TUH "4104" "283"  "194"  NA

Make a table and save it as a word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]

merged_df1 <- as.data.frame(merged_df)
write.csv(merged_df1, "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/merged_df_MAF01_overlap.csv")

# Create a flextable object from the merged_matrix
ft <- qflextable(as.data.frame(merged_df)) 

ft

Population

CRO

ITP

ITR

ALD

ITB

SLO

SER

BUL

FRS

MAL

ROS

GRA

STS

BAR

SPM

TUA

GRC

SPC

SPB

SPS

POP

SOC

GES

TUH

CRO

0.05

0.05

0.05

0.09

0.06

0.17

0.09

0.06

0.06

0.10

0.06

0.06

0.09

0.09

0.07

0.05

0.10

0.09

0.11

0.06

0.11

0.12

0.07

ITP

201

0.02

0.08

0.07

0.04

0.17

0.06

0.04

0.02

0.08

0.05

0.04

0.08

0.06

0.06

0.06

0.08

0.07

0.09

0.04

0.10

0.11

0.04

ITR

477

371

0.08

0.07

0.04

0.16

0.05

0.04

0.02

0.08

0.05

0.04

0.07

0.07

0.04

0.06

0.09

0.08

0.07

0.04

0.10

0.10

0.04

ALD

180

224

585

0.12

0.08

0.22

0.11

0.09

0.08

0.12

0.08

0.10

0.12

0.11

0.10

0.07

0.13

0.12

0.14

0.09

0.14

0.14

0.09

ITB

591

583

302

752

0.06

0.22

0.10

0.06

0.06

0.10

0.10

0.07

0.13

0.09

0.10

0.10

0.09

0.08

0.13

0.05

0.13

0.14

0.08

SLO

509

585

455

689

245

0.15

0.07

0.04

0.04

0.06

0.05

0.04

0.08

0.05

0.06

0.06

0.06

0.05

0.09

0.04

0.09

0.10

0.04

SER

322

520

697

442

672

476

0.20

0.16

0.16

0.20

0.19

0.18

0.22

0.21

0.20

0.19

0.22

0.20

0.24

0.17

0.23

0.23

0.17

BUL

428

604

899

414

950

782

316

0.07

0.06

0.10

0.09

0.08

0.11

0.10

0.08

0.09

0.10

0.10

0.12

0.07

0.13

0.13

0.07

FRS

1038

1004

654

1192

448

629

1100

1393

0.04

0.07

0.07

0.03

0.09

0.06

0.07

0.07

0.07

0.06

0.10

0.04

0.10

0.11

0.05

MAL

817

621

694

746

996

1118

1139

1157

1270

0.07

0.05

0.04

0.08

0.06

0.05

0.06

0.07

0.07

0.07

0.03

0.09

0.10

0.04

ROS

683

884

1047

771

961

726

367

445

1341

1498

0.10

0.08

0.12

0.09

0.10

0.10

0.10

0.09

0.13

0.07

0.13

0.14

0.08

GRA

711

705

1070

534

1279

1220

884

656

1709

873

1100

0.07

0.09

0.09

0.07

0.02

0.10

0.09

0.10

0.07

0.11

0.11

0.06

STS

1053

1098

835

1230

536

546

989

1304

412

1523

1125

1764

0.09

0.07

0.07

0.07

0.07

0.06

0.10

0.04

0.11

0.11

0.05

BAR

1333

1227

862

1446

824

1059

1489

1742

512

1233

1783

1900

916

0.11

0.06

0.10

0.13

0.12

0.11

0.09

0.14

0.14

0.04

SPM

1360

1227

883

1450

916

1160

1555

1782

684

1128

1873

1864

1096

211

0.09

0.10

0.03

0.01

0.08

0.05

0.12

0.13

0.07

TUA

851

901

1271

694

1441

1340

932

641

1885

1148

1057

277

1886

2123

2103

0.08

0.11

0.10

0.03

0.06

0.12

0.12

0.05

GRC

934

884

1224

754

1467

1440

1141

925

1877

862

1370

270

1975

2007

1944

445

0.11

0.09

0.11

0.07

0.12

0.12

0.07

SPC

1601

1474

1124

1698

1125

1363

1781

2020

818

1368

2086

2116

1214

310

252

2354

2193

0.01

0.14

0.05

0.13

0.13

0.08

SPB

2158

2040

1683

2263

1644

1869

2313

2569

1264

1923

2594

2687

1601

827

823

2925

2760

571

0.13

0.05

0.12

0.12

0.08

SPS

2144

2002

1669

2226

1690

1930

2341

2568

1371

1785

2651

2601

1750

874

787

2855

2643

567

332

0.09

0.15

0.16

0.09

POP

2197

2103

1734

2320

1645

1852

2316

2591

1225

2067

2566

2777

1509

878

939

3001

2876

702

283

615

0.10

0.10

0.05

SOC

1748

1898

2224

1678

2262

2060

1591

1327

2689

2316

1388

1470

2512

3068

3107

1198

1618

3346

3895

3892

3906

0.01

0.10

GES

1845

1991

2321

1770

2365

2166

1695

1426

2794

2395

1496

1540

2620

3168

3204

1265

1675

3445

3995

3988

4010

109

0.11

TUH

1919

2051

2395

1828

2461

2272

1797

1512

2897

2412

1629

1544

2745

3251

3275

1267

1650

3519

4076

4054

4104

283

194

6.4. Mantel test with Europe Set 3 SNPs overlapping with microsats

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/snps_sets/r2_0.01_b \
--keep-fam output/fst/pops_4bfst_overlap.txt \
--make-bed \
--out output/fst/mantel_overlap \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel_overlap.log

20968 variants loaded from .bim file. –keep-fam: 242 people remaining. Total genotyping rate in remaining samples is 0.973943. 20968 variants and 242 people pass filters and QC.

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/mantel_overlap \
--recodeA \
--out output/fst/mantel_overlap \
--silent;
grep 'samples\|variants\|remaining' output/fst/mantel_overlap.log

20968 variants loaded from .bim file. 20968 variants and 242 people pass filters and QC.

Import the data and covert it to genind format

# import the data
albo <-
  read.PLINK(
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/mantel_overlap.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)

Save

saveRDS(albo2, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/albo2_overlap_MAF01.rds"
))

Load it

albo2 <- readRDS(here(
 "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/albo2_overlap_MAF01.rds"
))

Import Sample locations

sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
head(sampling_loc)
## # A tibble: 6 × 10
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 1 more variable: order <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/mantel_overlap.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# # Replace the BEN a with BEN b (remember to never name samples with the same ID... I change it manually in the fam file.)
# fam_data <- fam_data %>% 
#   mutate(IndividualID = ifelse(FamilyID == "BEN" & IndividualID == "a", "b", IndividualID))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1      SOC         1065          0          0   0        -9    Sochi  Russia
## 2      SOC         1066          0          0   0        -9    Sochi  Russia
## 3      SOC         1067          0          0   0        -9    Sochi  Russia
## 4      SOC         1068          0          0   0        -9    Sochi  Russia
## 5      SOC         1069          0          0   0        -9    Sochi  Russia
## 6      SOC         1070          0          0   0        -9    Sochi  Russia
##   Latitude Longitude Continent Year         Region Marker order
## 1 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 2 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 3 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 4 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 5 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 6 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##          [,1]     [,2]
## [1,] 43.60042 39.74533
## [2,] 43.60042 39.74533
## [3,] 43.60042 39.74533
## [4,] 43.60042 39.74533
## [5,] 43.60042 39.74533
## [6,] 43.60042 39.74533
colnames(albo_dist2)<- c("x","y") 

Add jitter

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 43.59569 39.74613
## [2,] 43.59772 39.74774
## [3,] 43.60103 39.74744
## [4,] 43.60969 39.74969
## [5,] 43.59586 39.75527
## [6,] 43.59112 39.75388

Add to object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/albo2_SNP_overlap_MAF1.rds"
  )
)

6.4.1 Isolation By Distance for Europe (Set 3) SNPs overlapping with microsats

Convert the data

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Get 1 mosquito per population, it is just to get the geographical coordinates

unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

Dgen <- dist.genpop(toto,method=2)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen,Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: -0.09969185 
## 
## Based on 999 replicates
## Simulated p-value: 0.76 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
## -0.749420367 -0.001071686  0.017317312

Monte-Carlo test Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)

Observation: 0.08178366 Based on 999 replicates Simulated p-value: 0.225 Alternative hypothesis: greater

Std.Obs Expectation    Variance 

0.68997045 -0.00749107 0.01674157

Simulated p-value is not statistically signficant (0.225). Alternative hypothesis: greater. The test was one-sided, checking if the observed correlation is greater than what would be expected by chance.

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/simIBD_SNPs_MAF01_overlap.pdf"))
plot(ibd)
dev.off()
plot(ibd)

plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/Genetic_v_Geog_distance_SNPs_MAF01_overlap.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

dev.off()

Save Plot without equation

# Plot it
# Start the PDF device
CairoPDF(here(
     "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/Genetic_v_Geog_distance_SNPs_MAF01_overlap_noequ.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

#text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

dev.off()

Use library MASS for plot

library(MASS)

dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
#  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

title("Isolation by distance")

Save plot

library(MASS)
CairoPDF(here(
     "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/IDB_PlotFromMASS_density_SNPs_MAF01_overlap.pdf"))
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
# CairoPDF(here("output", "fst", "ibd.pdf"),
#     width = 5,
#     height = 4)
# png(here("output", "fst", "ibd2.png"),
#     width = 5,
#     height = 4,
#     units='in',
#     res = 300)
#myPal <-
 # colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")

dev.off()

Save plot with equation

png(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/IDB_PlotFromMASS_density_SNPs_MAF01_overlap_equation.png"),
    width = 5,
    height = 4,
    units='in',
    res = 300)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")
dev.off()

7. Import altered .fam file we created for MAF 1% dataset overlap with microsats (ITR/ROM merged)

#The fam file is the same for both data sets with the default or new priors
fam1 <-
  read.delim(
    file   = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/r2_0.01_b.fam"),
    header = FALSE,
    
  )
head(fam1)
##    V1   V2 V3 V4 V5 V6
## 1 SOC 1065  0  0  0 -9
## 2 SOC 1066  0  0  0 -9
## 3 SOC 1067  0  0  0 -9
## 4 SOC 1068  0  0  0 -9
## 5 SOC 1069  0  0  0 -9
## 6 SOC 1070  0  0  0 -9

Create list SNP pops with at least 4 individuals shared with microsats

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe

awk '{print $1}' output/snps_sets/r2_0.01_b.fam | sort | uniq -c | awk '{print $2, $1}' | awk '$2 >= 4 {print}' | awk '{print $1}' > output/fst/pops_4bfst.txt;
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
head  output/fst/pops_4bfst.txt;
wc -l output/fst/pops_4bfst.txt
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## 40 output/fst/pops_4bfst.txt
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
head  output/fst/pops_4bfst_overlap2.txt;
wc output/fst/pops_4bfst_overlap2.txt
## ALD
## BAR
## BUL
## CRO
## FRS
## GES
## GRA
## GRC
## ITB
## ITP
##  24  24 120 output/fst/pops_4bfst_overlap2.txt

Now we have 24 populations with both SNP & microsat data (with at least 4 individuals)

7.1 Convert to raw format and subset the bed file

First load plink

module load PLINK/1.9b_6.21-x86_64
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/overlap/r2_0.01_b \
--keep-fam output/fst/pops_4bfst_overlap2.txt \
--recodeA \
--out output/fst/overlap/overlap \
--silent;
grep 'samples\|variants\|remaining' output/fst/overlap/overlap.log

20968 variants loaded from .bim file. –keep-fam: 242 people remaining. Total genotyping rate in remaining samples is 0.973943. 20968 variants and 242 people pass filters and QC.

Look at https://rdrr.io/cran/StAMPP/man/stamppFst.html for details of Fst estimations

LD2 <-
  read.PLINK(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/overlap.raw"
    ),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )
## 
##  Reading PLINK raw format into a genlight object... 
## 
## 
##  Reading loci information... 
## 
##  Reading and converting genotypes... 
## .
##  Building final object... 
## 
## ...done.
summary(LD2)
##   Length    Class     Mode 
##        1 genlight       S4

7.2 Convert the genlight object to Stampp format, and estimate pairwide Fst values

# convert
LD2_2 <- stamppConvert(LD2, type="genlight")

This chunk will take a couple minutes to run.

# run stampp. If you want to run with bootstraps and nclusters use the HPC. It will run out of memory on a 32Gb laptop
LD2_3 <- stamppFst(LD2_2, 1, 95, 1)

Save it

saveRDS(
  LD2_3, here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/LD2_MAF1_overlap.rds"
  )
)

To load it

LD2_3 <- readRDS(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/LD2_MAF1_overlap.rds"
  )
)

Now lets look at the object

summary(LD2_3)
##       SOC               GES               STS               ITR         
##  Min.   :0.01127   Min.   :0.09666   Min.   :0.03010   Min.   :0.01703  
##  1st Qu.:0.10175   1st Qu.:0.10817   1st Qu.:0.04447   1st Qu.:0.03965  
##  Median :0.11561   Median :0.12145   Median :0.06819   Median :0.05552  
##  Mean   :0.11638   Mean   :0.12689   Mean   :0.06935   Mean   :0.05977  
##  3rd Qu.:0.13102   3rd Qu.:0.13765   3rd Qu.:0.07530   3rd Qu.:0.07254  
##  Max.   :0.22672   Max.   :0.23290   Max.   :0.17530   Max.   :0.15959  
##  NA's   :1         NA's   :2         NA's   :3         NA's   :4        
##       GRC               BAR               BUL               CRO         
##  Min.   :0.02418   Min.   :0.04376   Min.   :0.05680   Min.   :0.04540  
##  1st Qu.:0.06578   1st Qu.:0.08255   1st Qu.:0.06811   1st Qu.:0.05812  
##  Median :0.07974   Median :0.10140   Median :0.09262   Median :0.07012  
##  Mean   :0.08455   Mean   :0.10467   Mean   :0.09302   Mean   :0.08014  
##  3rd Qu.:0.09938   3rd Qu.:0.12147   3rd Qu.:0.10293   3rd Qu.:0.09397  
##  Max.   :0.18931   Max.   :0.21778   Max.   :0.19614   Max.   :0.17329  
##  NA's   :5         NA's   :6         NA's   :7         NA's   :8        
##       GRA               ITB               MAL               SPM          
##  Min.   :0.05215   Min.   :0.05193   Min.   :0.02244   Min.   :0.006262  
##  1st Qu.:0.06488   1st Qu.:0.06287   1st Qu.:0.03897   1st Qu.:0.052607  
##  Median :0.07909   Median :0.08386   Median :0.06256   Median :0.066148  
##  Mean   :0.08549   Mean   :0.09413   Mean   :0.06264   Mean   :0.077446  
##  3rd Qu.:0.09821   3rd Qu.:0.10322   3rd Qu.:0.07392   3rd Qu.:0.093330  
##  Max.   :0.18592   Max.   :0.21971   Max.   :0.16130   Max.   :0.213896  
##  NA's   :9         NA's   :10        NA's   :11        NA's   :12        
##       TUA               TUH               ALD               FRS         
##  Min.   :0.02844   Min.   :0.03796   Min.   :0.07934   Min.   :0.03937  
##  1st Qu.:0.05808   1st Qu.:0.04592   1st Qu.:0.09148   1st Qu.:0.04284  
##  Median :0.06820   Median :0.07927   Median :0.11690   Median :0.06241  
##  Mean   :0.08408   Mean   :0.07715   Mean   :0.11822   Mean   :0.07281  
##  3rd Qu.:0.10014   3rd Qu.:0.08713   3rd Qu.:0.12858   3rd Qu.:0.07965  
##  Max.   :0.19864   Max.   :0.17311   Max.   :0.21576   Max.   :0.16404  
##  NA's   :13        NA's   :14        NA's   :15        NA's   :16       
##       ITP               POP               ROS               SER        
##  Min.   :0.03615   Min.   :0.03586   Min.   :0.05906   Min.   :0.1529  
##  1st Qu.:0.05294   1st Qu.:0.05241   1st Qu.:0.09236   1st Qu.:0.1877  
##  Median :0.07859   Median :0.06361   Median :0.09956   Median :0.2100  
##  Mean   :0.08048   Mean   :0.07861   Mean   :0.11744   Mean   :0.2029  
##  3rd Qu.:0.08405   3rd Qu.:0.08702   3rd Qu.:0.13313   3rd Qu.:0.2252  
##  Max.   :0.17461   Max.   :0.16508   Max.   :0.20308   Max.   :0.2386  
##  NA's   :17        NA's   :18        NA's   :19        NA's   :20      
##       SLO               SPC                SPB              SPS     
##  Min.   :0.05438   Min.   :0.007113   Min.   :0.1281   Min.   : NA  
##  1st Qu.:0.05841   1st Qu.:0.039494   1st Qu.:0.1281   1st Qu.: NA  
##  Median :0.06244   Median :0.071875   Median :0.1281   Median : NA  
##  Mean   :0.06970   Mean   :0.071875   Mean   :0.1281   Mean   :NaN  
##  3rd Qu.:0.07737   3rd Qu.:0.104256   3rd Qu.:0.1281   3rd Qu.: NA  
##  Max.   :0.09230   Max.   :0.136637   Max.   :0.1281   Max.   : NA  
##  NA's   :21        NA's   :22         NA's   :23       NA's   :24

If you want you can save the fst values as csv.

# Convert to data frame
LD2_df <- data.frame(LD2_3)
# Save it
write.csv(LD2_df, file = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/LD2_df_01_overlap.csv"))

Check the Fst values

head(LD2_df)
##            SOC       GES        STS        ITR       GRC BAR BUL CRO GRA ITB
## SOC         NA        NA         NA         NA        NA  NA  NA  NA  NA  NA
## GES 0.01126978        NA         NA         NA        NA  NA  NA  NA  NA  NA
## STS 0.10591967 0.1092791         NA         NA        NA  NA  NA  NA  NA  NA
## ITR 0.09551819 0.1005738 0.03982659         NA        NA  NA  NA  NA  NA  NA
## GRC 0.11671692 0.1220816 0.07434949 0.05710568        NA  NA  NA  NA  NA  NA
## BAR 0.13602044 0.1427050 0.09268239 0.06549046 0.1009452  NA  NA  NA  NA  NA
##     MAL SPM TUA TUH ALD FRS ITP POP ROS SER SLO SPC SPB SPS
## SOC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GES  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## STS  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## ITR  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## GRC  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## BAR  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

Convert the data into a matrix.

aa <- as.matrix(LD2_df)
aa[upper.tri(aa)] <- t(aa)[upper.tri(t(aa))]
head(aa)
##            SOC        GES        STS        ITR        GRC        BAR
## SOC         NA 0.01126978 0.10591967 0.09551819 0.11671692 0.13602044
## GES 0.01126978         NA 0.10927914 0.10057375 0.12208159 0.14270495
## STS 0.10591967 0.10927914         NA 0.03982659 0.07434949 0.09268239
## ITR 0.09551819 0.10057375 0.03982659         NA 0.05710568 0.06549046
## GRC 0.11671692 0.12208159 0.07434949 0.05710568         NA 0.10094517
## BAR 0.13602044 0.14270495 0.09268239 0.06549046 0.10094517         NA
##            BUL        CRO        GRA        ITB        MAL        SPM
## SOC 0.12832355 0.11495017 0.10889177 0.13371682 0.09458982 0.12205750
## GES 0.13384104 0.12036716 0.11376844 0.13892193 0.10125102 0.12971049
## STS 0.07799253 0.06354859 0.07077473 0.07120783 0.03980952 0.06532393
## ITR 0.05392586 0.04949314 0.04798060 0.06838917 0.01702597 0.06977034
## GRC 0.09338818 0.05248230 0.02417956 0.09782363 0.05730552 0.09532555
## BAR 0.10885970 0.09393694 0.09225317 0.12946121 0.07823715 0.11048101
##            TUA        TUH        ALD        FRS        ITP        POP
## SOC 0.11826743 0.10435177 0.13932531 0.10279814 0.10070244 0.09884633
## GES 0.12401729 0.11137798 0.14452571 0.10779885 0.10598420 0.10461590
## STS 0.07272436 0.05322703 0.09733023 0.03009784 0.04109490 0.04320132
## ITR 0.03907520 0.03526920 0.08060398 0.03984728 0.02050277 0.03645268
## GRC 0.07973727 0.07020895 0.07388877 0.07086343 0.06244349 0.06910929
## BAR 0.06317696 0.04376254 0.12083140 0.08889532 0.08100898 0.08718046
##            ROS       SER        SLO        SPC        SPB        SPS
## SOC 0.13423764 0.2267217 0.09140753 0.12576249 0.11561471 0.15080720
## GES 0.13912010 0.2329000 0.09665973 0.13312607 0.12082033 0.15814054
## STS 0.07530389 0.1752962 0.04446887 0.06818588 0.06120598 0.09859755
## ITR 0.07758959 0.1595922 0.04133080 0.08683449 0.07820355 0.07085827
## GRC 0.10336915 0.1893113 0.05999989 0.10519477 0.09488816 0.10597149
## BAR 0.12469756 0.2177834 0.07855252 0.12926803 0.12167900 0.11394015
#sampling_loc <- read_csv("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_SNPs_fst.csv")

Save it

saveRDS(
  sampling_loc, here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"
  )
)

Import sample locations

sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))


# Arrange by region 
sampling_loc <- sampling_loc |>
  dplyr::arrange(
    order
  )

# Check it
head(sampling_loc)
## # A tibble: 6 × 10
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 1 more variable: order <dbl>

Order

order_pops <- as.vector(sampling_loc$Abbreviation)
order_pops
##  [1] "FRS" "STS" "POP" "SPB" "SPS" "SPC" "BAR" "SPM" "ITB" "ITR" "ITP" "MAL"
## [13] "SLO" "CRO" "ALD" "SER" "GRA" "GRC" "ROS" "BUL" "TUA" "TUH" "SOC" "GES"

Create vector with order of populations

# Extract the populations that appear in LD2_df
populations_in_LD2 <- colnames(LD2_df)

# Reorder the populations based on order_pops
poporder <- populations_in_LD2[populations_in_LD2 %in% order_pops]

#LD2_df[match(poporder, LD2_df$Abbreviation),] #this also doesn't reorder it

# Print the reordered populations
print(poporder)
##  [1] "SOC" "GES" "STS" "ITR" "GRC" "BAR" "BUL" "CRO" "GRA" "ITB" "MAL" "SPM"
## [13] "TUA" "TUH" "ALD" "FRS" "ITP" "POP" "ROS" "SER" "SLO" "SPC" "SPB" "SPS"

Lets check if the matrix is symmetric.

isSymmetric(aa)
## [1] TRUE

Order the matrix using poporder. We will also add NA on the upper left side of the matrix.

aa <- aa[order_pops, order_pops]
aa[lower.tri(aa)] <- NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long <- melt(aa)
summary(pairfst.long)
##       Var1          Var2         value        
##  FRS    : 24   FRS    : 24   Min.   :0.00626  
##  STS    : 24   STS    : 24   1st Qu.:0.06090  
##  POP    : 24   POP    : 24   Median :0.08701  
##  SPB    : 24   SPB    : 24   Mean   :0.09085  
##  SPS    : 24   SPS    : 24   3rd Qu.:0.10948  
##  SPC    : 24   SPC    : 24   Max.   :0.23859  
##  (Other):432   (Other):432   NA's   :300

Now lets plot the data with ggplot. You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f <- ggplot(pairfst.long, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 0)
  )
pairfst.f

Save it

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fst_matrix_europe_MAF01_overlap.pdf"),
  pairfst.f,
  width = 10,
  height = 10,
  units = "in"
)

7.3 Fst by country for SNP Set overlap

# Step 1: Map abbreviation to country
abbreviation_to_country <- sampling_loc %>% dplyr::select(Abbreviation, Country)

# Step 2: Calculate mean Fst for each pair of countries

# Convert the matrix to a data frame and add row names as a new column
fst_df <- as.data.frame(as.matrix(LD2_df))
fst_df$Abbreviation1 <- rownames(fst_df)

# Gather columns into rows
fst_long <- fst_df %>% gather(key = "Abbreviation2", value = "Fst", -Abbreviation1)

# Merge with country mapping
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation1", by.y = "Abbreviation")
fst_long <- merge(fst_long, abbreviation_to_country, by.x = "Abbreviation2", by.y = "Abbreviation", suffixes = c("_1", "_2"))

# Calculate mean Fst for each pair of countries
fst_summary <- fst_long %>% 
  group_by(Country_1, Country_2) %>% 
  summarize(Mean_Fst = mean(Fst, na.rm = TRUE), .groups = 'drop') %>% 
  filter(Country_1 != Country_2)

# Convert summary back to a matrix form, avoiding the use of tibbles for row names
fst_matrix_summary <- as.data.frame(spread(fst_summary, key = Country_2, value = Mean_Fst))
rownames(fst_matrix_summary) <- fst_matrix_summary$Country_1
fst_matrix_summary <- fst_matrix_summary[, -1]
fst_matrix_summary <- as.matrix(fst_matrix_summary)

# Make the matrix symmetric by averaging the off-diagonal elements
symmetric_fst_matrix <- matrix(nrow = nrow(fst_matrix_summary), ncol = ncol(fst_matrix_summary))
rownames(symmetric_fst_matrix) <- rownames(fst_matrix_summary)
colnames(symmetric_fst_matrix) <- colnames(fst_matrix_summary)

for(i in 1:nrow(fst_matrix_summary)) {
  for(j in i:nrow(fst_matrix_summary)) {
    if (i == j) {
      symmetric_fst_matrix[i, j] <- fst_matrix_summary[i, j]
    } else {
      avg_value <- mean(c(fst_matrix_summary[i, j], fst_matrix_summary[j, i]), na.rm = TRUE)
      symmetric_fst_matrix[i, j] <- avg_value
      symmetric_fst_matrix[j, i] <- avg_value
    }
  }
}

# Check if the matrix is symmetric
2
## [1] 2
# print(isSymmetric(symmetric_fst_matrix))

# Your symmetric Fst matrix by country is now in symmetric_fst_matrix
print(symmetric_fst_matrix)
##             Albania   Bulgaria    Croatia     France    Georgia     Greece
## Albania          NA 0.11279822 0.04539978 0.09472371 0.14452571 0.07649130
## Bulgaria 0.11279822         NA 0.08722725 0.07492144 0.13384104 0.09300545
## Croatia  0.04539978 0.08722725         NA 0.06302575 0.12036716 0.05544159
## France   0.09472371 0.07492144 0.06302575         NA 0.10853899 0.07064597
## Georgia  0.14452571 0.13384104 0.12036716 0.10853899         NA 0.11792501
## Greece   0.07649130 0.09300545 0.05544159 0.07064597 0.11792501         NA
## Italy    0.09154900 0.06636464 0.05965627 0.04890522 0.11515996 0.06500279
## Malta    0.08316641 0.05679897 0.05500955 0.03897441 0.10125102 0.05472746
## Portugal 0.09147718 0.06636208 0.06402375 0.04227483 0.10461590 0.06798589
## Romania  0.12354519 0.10293539 0.09925514 0.07490660 0.13912010 0.10150763
## Russia   0.13932531 0.12832355 0.11495017 0.10435891 0.01126978 0.11280435
## Serbia   0.21575958 0.19614135 0.17328819 0.16966823 0.23289996 0.18761510
## Slovenia 0.07933804 0.06530144 0.05729126 0.04390333 0.09665973 0.05726520
## Spain    0.12227116 0.10696192 0.09537205 0.07523979 0.13690047 0.09579872
## Turkey   0.09399600 0.07568412 0.07012296 0.05986011 0.11769763 0.07154177
##               Italy      Malta   Portugal    Romania     Russia    Serbia
## Albania  0.09154900 0.08316641 0.09147718 0.12354519 0.13932531 0.2157596
## Bulgaria 0.06636464 0.05679897 0.06636208 0.10293539 0.12832355 0.1961414
## Croatia  0.05965627 0.05500955 0.06402375 0.09925514 0.11495017 0.1732882
## France   0.04890522 0.03897441 0.04227483 0.07490660 0.10435891 0.1696682
## Georgia  0.11515996 0.10125102 0.10461590 0.13912010 0.01126978 0.2329000
## Greece   0.06500279 0.05472746 0.06798589 0.10150763 0.11280435 0.1876151
## Italy            NA 0.03035905 0.04157277 0.08661136 0.10997915 0.1846366
## Malta    0.03035905         NA 0.02975032 0.07357926 0.09458982 0.1612972
## Portugal 0.04157277 0.02975032         NA 0.07259454 0.09884633 0.1650812
## Romania  0.08661136 0.07357926 0.07259454         NA 0.13423764 0.2030822
## Russia   0.10997915 0.09458982 0.09884633 0.13423764         NA 0.2267217
## Serbia   0.18463665 0.16129723 0.16508121 0.20308218 0.22672167        NA
## Slovenia 0.04478448 0.03916767 0.03586147 0.05906033 0.09140753 0.1528773
## Spain    0.08665648 0.07372547 0.06801861 0.10896586 0.13005247 0.2176911
## Turkey   0.05623848 0.04497299 0.05514368 0.09174178 0.11130960 0.1858761
##            Slovenia      Spain     Turkey
## Albania  0.07933804 0.12227116 0.09399600
## Bulgaria 0.06530144 0.10696192 0.07568412
## Croatia  0.05729126 0.09537205 0.07012296
## France   0.04390333 0.07523979 0.05986011
## Georgia  0.09665973 0.13690047 0.11769763
## Greece   0.05726520 0.09579872 0.07154177
## Italy    0.04478448 0.08665648 0.05623848
## Malta    0.03916767 0.07372547 0.04497299
## Portugal 0.03586147 0.06801861 0.05514368
## Romania  0.05906033 0.10896586 0.09174178
## Russia   0.09140753 0.13005247 0.11130960
## Serbia   0.15287729 0.21769108 0.18587606
## Slovenia         NA 0.06748962 0.05168875
## Spain    0.06748962         NA 0.07429569
## Turkey   0.05168875 0.07429569         NA
symmetric_fst_matrix[lower.tri(symmetric_fst_matrix)] <- NA
print(symmetric_fst_matrix)
##          Albania  Bulgaria    Croatia     France   Georgia     Greece
## Albania       NA 0.1127982 0.04539978 0.09472371 0.1445257 0.07649130
## Bulgaria      NA        NA 0.08722725 0.07492144 0.1338410 0.09300545
## Croatia       NA        NA         NA 0.06302575 0.1203672 0.05544159
## France        NA        NA         NA         NA 0.1085390 0.07064597
## Georgia       NA        NA         NA         NA        NA 0.11792501
## Greece        NA        NA         NA         NA        NA         NA
## Italy         NA        NA         NA         NA        NA         NA
## Malta         NA        NA         NA         NA        NA         NA
## Portugal      NA        NA         NA         NA        NA         NA
## Romania       NA        NA         NA         NA        NA         NA
## Russia        NA        NA         NA         NA        NA         NA
## Serbia        NA        NA         NA         NA        NA         NA
## Slovenia      NA        NA         NA         NA        NA         NA
## Spain         NA        NA         NA         NA        NA         NA
## Turkey        NA        NA         NA         NA        NA         NA
##               Italy      Malta   Portugal    Romania     Russia    Serbia
## Albania  0.09154900 0.08316641 0.09147718 0.12354519 0.13932531 0.2157596
## Bulgaria 0.06636464 0.05679897 0.06636208 0.10293539 0.12832355 0.1961414
## Croatia  0.05965627 0.05500955 0.06402375 0.09925514 0.11495017 0.1732882
## France   0.04890522 0.03897441 0.04227483 0.07490660 0.10435891 0.1696682
## Georgia  0.11515996 0.10125102 0.10461590 0.13912010 0.01126978 0.2329000
## Greece   0.06500279 0.05472746 0.06798589 0.10150763 0.11280435 0.1876151
## Italy            NA 0.03035905 0.04157277 0.08661136 0.10997915 0.1846366
## Malta            NA         NA 0.02975032 0.07357926 0.09458982 0.1612972
## Portugal         NA         NA         NA 0.07259454 0.09884633 0.1650812
## Romania          NA         NA         NA         NA 0.13423764 0.2030822
## Russia           NA         NA         NA         NA         NA 0.2267217
## Serbia           NA         NA         NA         NA         NA        NA
## Slovenia         NA         NA         NA         NA         NA        NA
## Spain            NA         NA         NA         NA         NA        NA
## Turkey           NA         NA         NA         NA         NA        NA
##            Slovenia      Spain     Turkey
## Albania  0.07933804 0.12227116 0.09399600
## Bulgaria 0.06530144 0.10696192 0.07568412
## Croatia  0.05729126 0.09537205 0.07012296
## France   0.04390333 0.07523979 0.05986011
## Georgia  0.09665973 0.13690047 0.11769763
## Greece   0.05726520 0.09579872 0.07154177
## Italy    0.04478448 0.08665648 0.05623848
## Malta    0.03916767 0.07372547 0.04497299
## Portugal 0.03586147 0.06801861 0.05514368
## Romania  0.05906033 0.10896586 0.09174178
## Russia   0.09140753 0.13005247 0.11130960
## Serbia   0.15287729 0.21769108 0.18587606
## Slovenia         NA 0.06748962 0.05168875
## Spain            NA         NA 0.07429569
## Turkey           NA         NA         NA

Now we have to convert the matrix to a data frame to plot it with ggplot.

pairfst.long2 <- melt(symmetric_fst_matrix)
summary(pairfst.long2)
##        Var1           Var2         value        
##  Albania : 15   Albania : 15   Min.   :0.01127  
##  Bulgaria: 15   Bulgaria: 15   1st Qu.:0.06402  
##  Croatia : 15   Croatia : 15   Median :0.09148  
##  France  : 15   France  : 15   Mean   :0.09696  
##  Georgia : 15   Georgia : 15   3rd Qu.:0.11770  
##  Greece  : 15   Greece  : 15   Max.   :0.23290  
##  (Other) :135   (Other) :135   NA's   :120

You can click in the little square on the top left of the plot to open it on a new window. It will have the right proportions.

pairfst.f2 <- ggplot(pairfst.long2, aes(Var1, Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  scale_fill_gradient(
    low = "white",
    high = "#71b6ff",
    name = "Fst",
    na.value = "white",
    limits = c(0, 0.5)
  ) +
  scale_x_discrete(position = "top") +
  theme_bw() +
  geom_text(aes(label = ifelse(
    is.na(value), "", formatC(value, digits = 2, format = "f")
  )), size = 3) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.y = element_text(hjust = 1)
  )
pairfst.f2

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fst_matrix_europe_MAF01_by_country_overlap.pdf"),
  pairfst.f2, 
  width = 6,
  height = 5,
  units = "in"
)

Remove NAs and rename columns

# remove NAs
fst2 <-
  pairfst.long |>
  drop_na()

# rename columns
fst2 <-
  fst2 |>
  dplyr::rename(pop1 = 1,
         pop2 = 2,
         fst  = 3)


# Split the data into two data frames, one for pop1 and one for pop2
df_pop1 <- fst2 |>
  dplyr::select(pop = pop1, fst)
df_pop2 <- fst2 |>
  dplyr::select(pop = pop2, fst)

# Combine the two data frames
df_combined <- bind_rows(df_pop1, df_pop2)

# Calculate the mean fst for each population
mean_fst <- df_combined |>
  group_by(pop) |>
  summarise(mean_fst = mean(fst))

print(mean_fst)
## # A tibble: 24 × 2
##    pop   mean_fst
##    <fct>    <dbl>
##  1 FRS     0.0690
##  2 STS     0.0727
##  3 POP     0.0662
##  4 SPB     0.0861
##  5 SPS     0.112 
##  6 SPC     0.0955
##  7 BAR     0.105 
##  8 SPM     0.0846
##  9 ITB     0.0975
## 10 ITR     0.0622
## # ℹ 14 more rows

Merge

fst3 <-
  sampling_loc |>
  left_join(
    mean_fst,
    by = c("Abbreviation" = "pop")
  ) |>
  drop_na()

# check output
head(fst3)
## # A tibble: 6 × 11
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 2 more variables: order <dbl>, mean_fst <dbl>

Mean by region

# Group by Region and calculate the mean_fst by Region
region_means <- fst3 |>
  group_by(Region) |>
  summarize(mean_fst_by_region = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_region column to the fst3 tibble
fst3 <- fst3 |>
  left_join(region_means, by = "Region")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 24 × 12
##    Pop_City       Country Latitude Longitude Continent Abbreviation  Year Region
##    <chr>          <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr> 
##  1 Saint-Martin-… France      45.2     5.77  Europe    FRS           2019 Weste…
##  2 Strasbourg     France      48.6     7.75  Europe    STS           2019 Weste…
##  3 Penafiel       Portug…     41.2    -8.33  Europe    POP           2017 South…
##  4 Badajoz        Spain       38.9    -6.97  Europe    SPB           2018 South…
##  5 San Roque      Spain       36.2    -5.37  Europe    SPS           2017 South…
##  6 Catarroja      Spain       39.4    -0.396 Europe    SPC           2017 South…
##  7 Barcelona      Spain       41.4     2.17  Europe    BAR           2018 South…
##  8 Magaluf        Spain       39.5     2.53  Europe    SPM           2017 South…
##  9 Bologna        Italy       44.5    11.4   Europe    ITB           2017 South…
## 10 Rome           Italy       41.9    12.5   Europe    ITR           2013 South…
## # ℹ 14 more rows
## # ℹ 4 more variables: Marker <chr>, order <dbl>, mean_fst <dbl>,
## #   mean_fst_by_region <dbl>

Mean By country

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Country) |>
  summarize(mean_fst_by_country = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Country")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 24 × 13
##    Pop_City       Country Latitude Longitude Continent Abbreviation  Year Region
##    <chr>          <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr> 
##  1 Saint-Martin-… France      45.2     5.77  Europe    FRS           2019 Weste…
##  2 Strasbourg     France      48.6     7.75  Europe    STS           2019 Weste…
##  3 Penafiel       Portug…     41.2    -8.33  Europe    POP           2017 South…
##  4 Badajoz        Spain       38.9    -6.97  Europe    SPB           2018 South…
##  5 San Roque      Spain       36.2    -5.37  Europe    SPS           2017 South…
##  6 Catarroja      Spain       39.4    -0.396 Europe    SPC           2017 South…
##  7 Barcelona      Spain       41.4     2.17  Europe    BAR           2018 South…
##  8 Magaluf        Spain       39.5     2.53  Europe    SPM           2017 South…
##  9 Bologna        Italy       44.5    11.4   Europe    ITB           2017 South…
## 10 Rome           Italy       41.9    12.5   Europe    ITR           2013 South…
## # ℹ 14 more rows
## # ℹ 5 more variables: Marker <chr>, order <dbl>, mean_fst <dbl>,
## #   mean_fst_by_region <dbl>, mean_fst_by_country <dbl>
# Rename columns
fst3 <- fst3 |>
  dplyr::rename(
    City = Pop_City)

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 24 × 13
##    City    Country Latitude Longitude Continent Abbreviation  Year Region Marker
##    <chr>   <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
##  1 Saint-… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
##  2 Strasb… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
##  3 Penafi… Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
##  4 Badajoz Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
##  5 San Ro… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
##  6 Catarr… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
##  7 Barcel… Spain       41.4     2.17  Europe    BAR           2018 South… SNPs  
##  8 Magaluf Spain       39.5     2.53  Europe    SPM           2017 South… SNPs  
##  9 Bologna Italy       44.5    11.4   Europe    ITB           2017 South… SNPs  
## 10 Rome    Italy       41.9    12.5   Europe    ITR           2013 South… SNPs  
## # ℹ 14 more rows
## # ℹ 4 more variables: order <dbl>, mean_fst <dbl>, mean_fst_by_region <dbl>,
## #   mean_fst_by_country <dbl>

Mean By continent

# Group by Country and calculate the mean_fst by Country
country_means <- fst3 |>
  group_by(Continent) |>
  summarize(mean_fst_by_continent = round(mean(mean_fst, na.rm = TRUE), 2)) |>
  ungroup()  # Ungroup the data

# Add the mean_fst_by_country column to the fst3 tibble
fst3 <- fst3 |>
  left_join(country_means, by = "Continent")

# Print the modified fst3 tibble
print(fst3)
## # A tibble: 24 × 14
##    City    Country Latitude Longitude Continent Abbreviation  Year Region Marker
##    <chr>   <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
##  1 Saint-… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
##  2 Strasb… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
##  3 Penafi… Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
##  4 Badajoz Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
##  5 San Ro… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
##  6 Catarr… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
##  7 Barcel… Spain       41.4     2.17  Europe    BAR           2018 South… SNPs  
##  8 Magaluf Spain       39.5     2.53  Europe    SPM           2017 South… SNPs  
##  9 Bologna Italy       44.5    11.4   Europe    ITB           2017 South… SNPs  
## 10 Rome    Italy       41.9    12.5   Europe    ITR           2013 South… SNPs  
## # ℹ 14 more rows
## # ℹ 5 more variables: order <dbl>, mean_fst <dbl>, mean_fst_by_region <dbl>,
## #   mean_fst_by_country <dbl>, mean_fst_by_continent <dbl>
fst4 <- fst3 |>
  dplyr::select(
    Region, mean_fst_by_region, Country, mean_fst_by_country, City, Abbreviation, mean_fst,
  )

fst4 <- fst4 |>
  arrange(
    Region, Country, City
  )

# Round
fst4 <- fst4 |>
  mutate_if(is.numeric, ~ round(., 2))

head(fst4)
## # A tibble: 6 × 7
##   Region       mean_fst_by_region Country mean_fst_by_country City  Abbreviation
##   <chr>                     <dbl> <chr>                 <dbl> <chr> <chr>       
## 1 Eastern Eur…               0.11 Bulgar…                0.09 Lom   BUL         
## 2 Eastern Eur…               0.11 Georgia                0.12 Sakh… GES         
## 3 Eastern Eur…               0.11 Romania                0.1  Satu… ROS         
## 4 Eastern Eur…               0.11 Russia                 0.12 Sochi SOC         
## 5 Eastern Eur…               0.11 Serbia                 0.19 Novi… SER         
## 6 Eastern Eur…               0.11 Turkey                 0.08 Alia… TUA         
## # ℹ 1 more variable: mean_fst <dbl>
# Set theme if you want to use something different from the previous table
set_flextable_defaults(
  font.family = "Arial",
  font.size = 9,
  big.mark = ",",
  theme_fun = "theme_zebra" # try the themes: theme_alafoli(), theme_apa(), theme_booktabs(), theme_box(), theme_tron_legacy(), theme_tron(), theme_vader(), theme_vanilla(), theme_zebra()
)

# Then create the flextable object
flex_table <- flextable(fst4) |>
  set_caption(caption = as_paragraph(
    as_chunk(
      "Table 1. Fst values for European SNPs overlapping with microsat dataset.",
      props = fp_text_default(color = "#000000", font.size = 14)
    )
  ),
  fp_p = fp_par(text.align = "center", padding = 5))

# Print the flextable
flex_table
Table 1. Fst values for European SNPs overlapping with microsat dataset.

Region

mean_fst_by_region

Country

mean_fst_by_country

City

Abbreviation

mean_fst

Eastern Europe

0.11

Bulgaria

0.09

Lom

BUL

0.09

Eastern Europe

0.11

Georgia

0.12

Sakhumi, Abkhazia

GES

0.12

Eastern Europe

0.11

Romania

0.10

Satu Mare

ROS

0.10

Eastern Europe

0.11

Russia

0.12

Sochi

SOC

0.12

Eastern Europe

0.11

Serbia

0.19

Novi Sad

SER

0.19

Eastern Europe

0.11

Turkey

0.08

Aliaga

TUA

0.08

Eastern Europe

0.11

Turkey

0.08

Hopa

TUH

0.07

Southern Europe

0.08

Albania

0.11

Durres

ALD

0.11

Southern Europe

0.08

Croatia

0.08

Dubrovnik

CRO

0.08

Southern Europe

0.08

Greece

0.08

Athens

GRA

0.08

Southern Europe

0.08

Greece

0.08

Chania

GRC

0.09

Southern Europe

0.08

Italy

0.08

Bologna

ITB

0.10

Southern Europe

0.08

Italy

0.08

Puglia

ITP

0.07

Southern Europe

0.08

Italy

0.08

Rome

ITR

0.06

Southern Europe

0.08

Malta

0.06

Luqa

MAL

0.06

Southern Europe

0.08

Portugal

0.07

Penafiel

POP

0.07

Southern Europe

0.08

Slovenia

0.06

Ajdovscina

SLO

0.06

Southern Europe

0.08

Spain

0.10

Badajoz

SPB

0.09

Southern Europe

0.08

Spain

0.10

Barcelona

BAR

0.11

Southern Europe

0.08

Spain

0.10

Catarroja

SPC

0.10

Southern Europe

0.08

Spain

0.10

Magaluf

SPM

0.08

Southern Europe

0.08

Spain

0.10

San Roque

SPS

0.11

Western Europe

0.07

France

0.07

Saint-Martin-d'Heres

FRS

0.07

Western Europe

0.07

France

0.07

Strasbourg

STS

0.07

# Initialize Word document
doc <- 
  read_docx() |>
  body_add_flextable(value = flex_table)

# Define the output path with 'here' library
output_path <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fstTable_Europe_SNPs_MAF01_overlap.docx"
  )

# Save the Word document
print(doc, target = output_path)

To make scatter plot

# Group by Country and calculate the mean for mean_fst_by_country
aggregated_data <- fst4 |>
  dplyr::group_by(Country) |>
  dplyr::summarise(mean_fst = mean(mean_fst_by_country, na.rm = TRUE))

# save the data
saveRDS(aggregated_data, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/LD2_country_europe_MAF01_overlap.rds"
))

# Order the aggregated data
aggregated_data <- aggregated_data[order(aggregated_data$mean_fst), ]

# Assign a numeric index for plotting
aggregated_data$index <- 1:nrow(aggregated_data)

# Fit a linear model
lm_fit <- lm(mean_fst ~ index, data = aggregated_data)

# Predicted values from the linear model
aggregated_data$fitted_values <- predict(lm_fit)

ggplot(aggregated_data, aes(x = index, y = mean_fst)) +
  geom_point(aes(color = Country), size = 3) +
  geom_line(aes(y = fitted_values), color = "blue") +  # Fitted line
  labs(
    title = "Mean Fst by Country",
    x = "Ordered Countries",
    y = "Mean Fst Value"
  ) +
  scale_x_continuous(breaks = aggregated_data$index, labels = aggregated_data$Country) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(legend.position = "none")

Save it

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/mean_fst_by_country_SNPs_MAF01_overlap.pdf"),
  width = 10,
  height = 10,
  units = "in"
)

Estimate distances

# Grab the population names from the matrix aa
populations_with_fst <- colnames(aa)

# Subset the sampling_loc dataframe to only include populations with FST estimates
filtered_sampling_loc <- sampling_loc %>% filter(Abbreviation %in% populations_with_fst)

# Create an empty matrix to store the distances
n <- nrow(filtered_sampling_loc)
distance_matrix <- matrix(0, n, n)
rownames(distance_matrix) <- filtered_sampling_loc$Abbreviation
colnames(distance_matrix) <- filtered_sampling_loc$Abbreviation

# Calculate the distances
for (i in 1:n) {
  for (j in 1:n) {
    if (i != j) {
      coord1 <- c(filtered_sampling_loc$Longitude[i], filtered_sampling_loc$Latitude[i])
      coord2 <- c(filtered_sampling_loc$Longitude[j], filtered_sampling_loc$Latitude[j])
      distance_matrix[i, j] <- distHaversine(coord1, coord2) / 1000 # distance in km
    }
  }
}

# Print the distance matrix
head(distance_matrix)
##           FRS       STS       POP       SPB       SPS       SPC      BAR
## FRS    0.0000  412.1522 1225.4383 1263.7518 1371.4448  817.6702 511.8716
## STS  412.1522    0.0000 1509.1745 1601.1573 1750.4874 1213.5639 915.9614
## POP 1225.4383 1509.1745    0.0000  282.8409  614.5101  701.9619 878.2910
## SPB 1263.7518 1601.1573  282.8409    0.0000  331.7685  571.0509 827.0716
## SPS 1371.4448 1750.4874  614.5101  331.7685    0.0000  566.5153 874.4050
## SPC  817.6702 1213.5639  701.9619  571.0509  566.5153    0.0000 310.0074
##           SPM       ITB       ITR      ITP      MAL       SLO      CRO      ALD
## FRS  683.8845  448.0871  654.1062 1004.474 1269.904  628.7258 1037.845 1192.210
## STS 1095.6072  536.0256  834.5057 1098.126 1522.503  546.2343 1052.803 1230.239
## POP  939.4811 1644.8695 1734.4434 2102.566 2066.730 1851.8481 2197.100 2319.742
## SPB  822.8201 1643.6694 1683.0686 2040.301 1923.429 1869.2123 2157.626 2262.516
## SPS  787.3372 1690.3169 1669.1413 2002.327 1785.308 1929.6259 2144.409 2225.649
## SPC  251.7726 1124.6677 1124.2692 1474.434 1368.139 1363.2153 1600.981 1697.515
##           SER      GRA      GRC      ROS      BUL      TUA      TUH      SOC
## FRS 1100.2591 1709.174 1877.379 1341.427 1392.500 1884.818 2897.030 2688.509
## STS  988.6774 1763.684 1974.712 1124.852 1304.105 1885.544 2745.357 2511.584
## POP 2315.5228 2777.366 2876.039 2566.256 2591.385 3000.557 4103.736 3906.072
## SPB 2313.3538 2686.566 2760.206 2594.045 2569.099 2924.845 4076.402 3894.554
## SPS 2340.8167 2601.355 2642.916 2651.131 2568.416 2854.982 4053.790 3891.912
## SPC 1780.6744 2115.539 2193.176 2085.530 2019.617 2354.477 3518.971 3346.038
##          GES
## FRS 2793.592
## STS 2619.862
## POP 4009.674
## SPB 3995.220
## SPS 3988.376
## SPC 3444.704

Compare distance and FST

# Fill lower triangle of 'aa' matrix
aa[lower.tri(aa)] <- t(aa)[lower.tri(aa)]

# Fill diagonal with 0 (or another value that makes sense in your context)
diag(aa) <- 0


# Combine 'aa' and 'distance_matrix'
data <- data.frame(Distance = as.vector(distance_matrix), FST = as.vector(aa))

# Add row and column indices for easier tracking
data$row_index <- rep(rownames(distance_matrix), each = ncol(distance_matrix))
data$col_index <- rep(colnames(distance_matrix), nrow(distance_matrix))

data <- data |>
  dplyr::arrange(
    Distance
  )
head(data)
##   Distance FST row_index col_index
## 1        0   0       FRS       FRS
## 2        0   0       STS       STS
## 3        0   0       POP       POP
## 4        0   0       SPB       SPB
## 5        0   0       SPS       SPS
## 6        0   0       SPC       SPC

Fit linear regression

data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("analyses", "my_theme2.R"))


# Plot
ggplot(data, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
 annotate("text", x = max(log((data$Distance))) * 0.85, y = max(data$FST/(1-data$FST)) * 0.95, label = paste(equation_text, r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),  # Increase font size for x-axis
        axis.text.y = element_text(size = 14                               ))  # Increase font size for y-axi
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fst_by_distance_SNPs_MAF01_overlap_with_equ.pdf"),
  width = 6,
  height = 4,
  units = "in"
)

Without equation

data <- data[data$Distance > 0, ]

# Fit linear model
lm_model <- lm(FST/(1-FST) ~ log(Distance), data = data)
equation_text <- sprintf("y = %.6fx + %.3f", coef(lm_model)[2], coef(lm_model)[1])
r2_text <- sprintf("R^2 = %.2f", summary(lm_model)$r.squared)

# source the plotting function
source(here("analyses", "my_theme2.R"))


# Plot
ggplot(data, aes(x = log(Distance), y = FST/(1-FST))) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
# annotate("text", x = max(log((data$Distance))) * 0.85, y = max(data$FST/(1-data$FST)) * 0.95, label = paste(equation_text, #r2_text, sep = "\n"), size = 4, color = "black") +
  labs(title = "FST vs Distance - All populations",
       x = "Log(Distance)",
       y = "FST(1-FST)") +
  scale_x_continuous(labels = scales::comma) + 
  theme_classic() +
  theme(axis.text.x = element_text(size = 14),  # Increase font size for x-axis
        axis.text.y = element_text(size = 14                               ))  # Increase font size for y-axi
## `geom_smooth()` using formula = 'y ~ x'

ggsave(
  filename = here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/fst_by_distance_SNPs_MAF01_overlap.pdf"),
  width = 6,
  height = 4,
  units = "in"
)

We can merge the FST and distance matrices

# Ensure the matrices have the same names in the same order
common_names <- intersect(rownames(distance_matrix), rownames(aa))
sorted_names <- sort(common_names)

# Reorder the matrices
distance_matrix <- distance_matrix[sorted_names, sorted_names]
aa <- aa[sorted_names, sorted_names]

# Initialize the final merged matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names
colnames(merged_matrix) <- sorted_names

# Fill the upper triangular part from aa
merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]

# Fill the lower triangular part from distance_matrix
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Format the matrix (Fst two decimals and distance in Km with zero decimals)
# Format the elements based on their position in the matrix
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      # Upper triangular - Fst values with two decimal places
      merged_matrix[i, j] <- sprintf("%.2f", as.numeric(merged_matrix[i, j]))
    } else if (i > j) {
      # Lower triangular - Distance values with zero decimal places
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Now the merged_matrix should be formatted as you need
print(merged_matrix)
##     ALD    BAR    BUL    CRO    FRS    GES    GRA    GRC    ITB    ITP   
## ALD NA     "0.12" "0.11" "0.05" "0.09" "0.14" "0.08" "0.07" "0.12" "0.08"
## BAR "1446" NA     "0.11" "0.09" "0.09" "0.14" "0.09" "0.10" "0.13" "0.08"
## BUL "414"  "1742" NA     "0.09" "0.07" "0.13" "0.09" "0.09" "0.10" "0.06"
## CRO "180"  "1333" "428"  NA     "0.06" "0.12" "0.06" "0.05" "0.09" "0.05"
## FRS "1192" "512"  "1393" "1038" NA     "0.11" "0.07" "0.07" "0.06" "0.04"
## GES "1770" "3168" "1426" "1845" "2794" NA     "0.11" "0.12" "0.14" "0.11"
## GRA "534"  "1900" "656"  "711"  "1709" "1540" NA     "0.02" "0.10" "0.05"
## GRC "754"  "2007" "925"  "934"  "1877" "1675" "270"  NA     "0.10" "0.06"
## ITB "752"  "824"  "950"  "591"  "448"  "2365" "1279" "1467" NA     "0.07"
## ITP "224"  "1227" "604"  "201"  "1004" "1991" "705"  "884"  "583"  NA    
## ITR "585"  "862"  "899"  "477"  "654"  "2321" "1070" "1224" "302"  "371" 
## MAL "746"  "1233" "1157" "817"  "1270" "2395" "873"  "862"  "996"  "621" 
## POP "2320" "878"  "2591" "2197" "1225" "4010" "2777" "2876" "1645" "2103"
## ROS "771"  "1783" "445"  "683"  "1341" "1496" "1100" "1370" "961"  "884" 
## SER "442"  "1489" "316"  "322"  "1100" "1695" "884"  "1141" "672"  "520" 
## SLO "689"  "1059" "782"  "509"  "629"  "2166" "1220" "1440" "245"  "585" 
## SOC "1678" "3068" "1327" "1748" "2689" "109"  "1470" "1618" "2262" "1898"
## SPB "2263" "827"  "2569" "2158" "1264" "3995" "2687" "2760" "1644" "2040"
## SPC "1698" "310"  "2020" "1601" "818"  "3445" "2116" "2193" "1125" "1474"
## SPM "1450" "211"  "1782" "1360" "684"  "3204" "1864" "1944" "916"  "1227"
## SPS "2226" "874"  "2568" "2144" "1371" "3988" "2601" "2643" "1690" "2002"
## STS "1230" "916"  "1304" "1053" "412"  "2620" "1764" "1975" "536"  "1098"
## TUA "694"  "2123" "641"  "851"  "1885" "1265" "277"  "445"  "1441" "901" 
## TUH "1828" "3251" "1512" "1919" "2897" "194"  "1544" "1650" "2461" "2051"
##     ITR    MAL    POP    ROS    SER    SLO    SOC    SPB    SPC    SPM   
## ALD "0.08" "0.08" "0.09" "0.12" "0.22" "0.08" "0.14" "0.12" "0.13" "0.11"
## BAR "0.07" "0.08" "0.09" "0.12" "0.22" "0.08" "0.14" "0.12" "0.13" "0.11"
## BUL "0.05" "0.06" "0.07" "0.10" "0.20" "0.07" "0.13" "0.10" "0.10" "0.10"
## CRO "0.05" "0.06" "0.06" "0.10" "0.17" "0.06" "0.11" "0.09" "0.10" "0.09"
## FRS "0.04" "0.04" "0.04" "0.07" "0.16" "0.04" "0.10" "0.06" "0.07" "0.06"
## GES "0.10" "0.10" "0.10" "0.14" "0.23" "0.10" "0.01" "0.12" "0.13" "0.13"
## GRA "0.05" "0.05" "0.07" "0.10" "0.19" "0.05" "0.11" "0.09" "0.10" "0.09"
## GRC "0.06" "0.06" "0.07" "0.10" "0.19" "0.06" "0.12" "0.09" "0.11" "0.10"
## ITB "0.07" "0.06" "0.05" "0.10" "0.22" "0.06" "0.13" "0.08" "0.09" "0.09"
## ITP "0.02" "0.02" "0.04" "0.08" "0.17" "0.04" "0.10" "0.07" "0.08" "0.06"
## ITR NA     "0.02" "0.04" "0.08" "0.16" "0.04" "0.10" "0.08" "0.09" "0.07"
## MAL "694"  NA     "0.03" "0.07" "0.16" "0.04" "0.09" "0.07" "0.07" "0.06"
## POP "1734" "2067" NA     "0.07" "0.17" "0.04" "0.10" "0.05" "0.05" "0.05"
## ROS "1047" "1498" "2566" NA     "0.20" "0.06" "0.13" "0.09" "0.10" "0.09"
## SER "697"  "1139" "2316" "367"  NA     "0.15" "0.23" "0.20" "0.22" "0.21"
## SLO "455"  "1118" "1852" "726"  "476"  NA     "0.09" "0.05" "0.06" "0.05"
## SOC "2224" "2316" "3906" "1388" "1591" "2060" NA     "0.12" "0.13" "0.12"
## SPB "1683" "1923" "283"  "2594" "2313" "1869" "3895" NA     "0.01" "0.01"
## SPC "1124" "1368" "702"  "2086" "1781" "1363" "3346" "571"  NA     "0.03"
## SPM "883"  "1128" "939"  "1873" "1555" "1160" "3107" "823"  "252"  NA    
## SPS "1669" "1785" "615"  "2651" "2341" "1930" "3892" "332"  "567"  "787" 
## STS "835"  "1523" "1509" "1125" "989"  "546"  "2512" "1601" "1214" "1096"
## TUA "1271" "1148" "3001" "1057" "932"  "1340" "1198" "2925" "2354" "2103"
## TUH "2395" "2412" "4104" "1629" "1797" "2272" "283"  "4076" "3519" "3275"
##     SPS    STS    TUA    TUH   
## ALD "0.14" "0.10" "0.10" "0.09"
## BAR "0.11" "0.09" "0.06" "0.04"
## BUL "0.12" "0.08" "0.08" "0.07"
## CRO "0.11" "0.06" "0.07" "0.07"
## FRS "0.10" "0.03" "0.07" "0.05"
## GES "0.16" "0.11" "0.12" "0.11"
## GRA "0.10" "0.07" "0.07" "0.06"
## GRC "0.11" "0.07" "0.08" "0.07"
## ITB "0.13" "0.07" "0.10" "0.08"
## ITP "0.09" "0.04" "0.06" "0.04"
## ITR "0.07" "0.04" "0.04" "0.04"
## MAL "0.07" "0.04" "0.05" "0.04"
## POP "0.09" "0.04" "0.06" "0.05"
## ROS "0.13" "0.08" "0.10" "0.08"
## SER "0.24" "0.18" "0.20" "0.17"
## SLO "0.09" "0.04" "0.06" "0.04"
## SOC "0.15" "0.11" "0.12" "0.10"
## SPB "0.13" "0.06" "0.10" "0.08"
## SPC "0.14" "0.07" "0.11" "0.08"
## SPM "0.08" "0.07" "0.09" "0.07"
## SPS NA     "0.10" "0.03" "0.09"
## STS "1750" NA     "0.07" "0.05"
## TUA "2855" "1886" NA     "0.05"
## TUH "4054" "2745" "1267" NA
cities <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
cities <- as_tibble(cities)
head(cities)
## # A tibble: 6 × 10
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 1 more variable: order <dbl>

We can sort by distance

# Calculate row-wise mean distances (excluding diagonal)
row_means <- rowMeans(distance_matrix, na.rm=TRUE)

# Sort row names by mean distances
sorted_names_by_distance <- names(sort(row_means))

# Reorder distance_matrix and aa matrices based on these sorted names
distance_matrix <- distance_matrix[sorted_names_by_distance, sorted_names_by_distance]
aa <- aa[sorted_names_by_distance, sorted_names_by_distance]

# Your existing code to initialize and fill the merged_matrix
merged_matrix <- matrix(NA, nrow = nrow(aa), ncol = ncol(aa))
rownames(merged_matrix) <- sorted_names_by_distance
colnames(merged_matrix) <- sorted_names_by_distance

merged_matrix[upper.tri(merged_matrix, diag = FALSE)] <- aa[upper.tri(aa, diag = FALSE)]
merged_matrix[lower.tri(merged_matrix, diag = FALSE)] <- distance_matrix[lower.tri(distance_matrix, diag = FALSE)]

# Formatting code with absolute value for upper triangular part
for(i in 1:nrow(merged_matrix)) {
  for(j in 1:ncol(merged_matrix)) {
    if (i < j) {
      merged_matrix[i, j] <- sprintf("%.2f", abs(as.numeric(merged_matrix[i, j])))
    } else if (i > j) {
      merged_matrix[i, j] <- sprintf("%.0f", as.numeric(merged_matrix[i, j]))
    }
  }
}

# Print the merged matrix
print(merged_matrix)
##     CRO    ITP    ITR    ALD    ITB    SLO    SER    BUL    FRS    MAL   
## CRO NA     "0.05" "0.05" "0.05" "0.09" "0.06" "0.17" "0.09" "0.06" "0.06"
## ITP "201"  NA     "0.02" "0.08" "0.07" "0.04" "0.17" "0.06" "0.04" "0.02"
## ITR "477"  "371"  NA     "0.08" "0.07" "0.04" "0.16" "0.05" "0.04" "0.02"
## ALD "180"  "224"  "585"  NA     "0.12" "0.08" "0.22" "0.11" "0.09" "0.08"
## ITB "591"  "583"  "302"  "752"  NA     "0.06" "0.22" "0.10" "0.06" "0.06"
## SLO "509"  "585"  "455"  "689"  "245"  NA     "0.15" "0.07" "0.04" "0.04"
## SER "322"  "520"  "697"  "442"  "672"  "476"  NA     "0.20" "0.16" "0.16"
## BUL "428"  "604"  "899"  "414"  "950"  "782"  "316"  NA     "0.07" "0.06"
## FRS "1038" "1004" "654"  "1192" "448"  "629"  "1100" "1393" NA     "0.04"
## MAL "817"  "621"  "694"  "746"  "996"  "1118" "1139" "1157" "1270" NA    
## ROS "683"  "884"  "1047" "771"  "961"  "726"  "367"  "445"  "1341" "1498"
## GRA "711"  "705"  "1070" "534"  "1279" "1220" "884"  "656"  "1709" "873" 
## STS "1053" "1098" "835"  "1230" "536"  "546"  "989"  "1304" "412"  "1523"
## BAR "1333" "1227" "862"  "1446" "824"  "1059" "1489" "1742" "512"  "1233"
## SPM "1360" "1227" "883"  "1450" "916"  "1160" "1555" "1782" "684"  "1128"
## TUA "851"  "901"  "1271" "694"  "1441" "1340" "932"  "641"  "1885" "1148"
## GRC "934"  "884"  "1224" "754"  "1467" "1440" "1141" "925"  "1877" "862" 
## SPC "1601" "1474" "1124" "1698" "1125" "1363" "1781" "2020" "818"  "1368"
## SPB "2158" "2040" "1683" "2263" "1644" "1869" "2313" "2569" "1264" "1923"
## SPS "2144" "2002" "1669" "2226" "1690" "1930" "2341" "2568" "1371" "1785"
## POP "2197" "2103" "1734" "2320" "1645" "1852" "2316" "2591" "1225" "2067"
## SOC "1748" "1898" "2224" "1678" "2262" "2060" "1591" "1327" "2689" "2316"
## GES "1845" "1991" "2321" "1770" "2365" "2166" "1695" "1426" "2794" "2395"
## TUH "1919" "2051" "2395" "1828" "2461" "2272" "1797" "1512" "2897" "2412"
##     ROS    GRA    STS    BAR    SPM    TUA    GRC    SPC    SPB    SPS   
## CRO "0.10" "0.06" "0.06" "0.09" "0.09" "0.07" "0.05" "0.10" "0.09" "0.11"
## ITP "0.08" "0.05" "0.04" "0.08" "0.06" "0.06" "0.06" "0.08" "0.07" "0.09"
## ITR "0.08" "0.05" "0.04" "0.07" "0.07" "0.04" "0.06" "0.09" "0.08" "0.07"
## ALD "0.12" "0.08" "0.10" "0.12" "0.11" "0.10" "0.07" "0.13" "0.12" "0.14"
## ITB "0.10" "0.10" "0.07" "0.13" "0.09" "0.10" "0.10" "0.09" "0.08" "0.13"
## SLO "0.06" "0.05" "0.04" "0.08" "0.05" "0.06" "0.06" "0.06" "0.05" "0.09"
## SER "0.20" "0.19" "0.18" "0.22" "0.21" "0.20" "0.19" "0.22" "0.20" "0.24"
## BUL "0.10" "0.09" "0.08" "0.11" "0.10" "0.08" "0.09" "0.10" "0.10" "0.12"
## FRS "0.07" "0.07" "0.03" "0.09" "0.06" "0.07" "0.07" "0.07" "0.06" "0.10"
## MAL "0.07" "0.05" "0.04" "0.08" "0.06" "0.05" "0.06" "0.07" "0.07" "0.07"
## ROS NA     "0.10" "0.08" "0.12" "0.09" "0.10" "0.10" "0.10" "0.09" "0.13"
## GRA "1100" NA     "0.07" "0.09" "0.09" "0.07" "0.02" "0.10" "0.09" "0.10"
## STS "1125" "1764" NA     "0.09" "0.07" "0.07" "0.07" "0.07" "0.06" "0.10"
## BAR "1783" "1900" "916"  NA     "0.11" "0.06" "0.10" "0.13" "0.12" "0.11"
## SPM "1873" "1864" "1096" "211"  NA     "0.09" "0.10" "0.03" "0.01" "0.08"
## TUA "1057" "277"  "1886" "2123" "2103" NA     "0.08" "0.11" "0.10" "0.03"
## GRC "1370" "270"  "1975" "2007" "1944" "445"  NA     "0.11" "0.09" "0.11"
## SPC "2086" "2116" "1214" "310"  "252"  "2354" "2193" NA     "0.01" "0.14"
## SPB "2594" "2687" "1601" "827"  "823"  "2925" "2760" "571"  NA     "0.13"
## SPS "2651" "2601" "1750" "874"  "787"  "2855" "2643" "567"  "332"  NA    
## POP "2566" "2777" "1509" "878"  "939"  "3001" "2876" "702"  "283"  "615" 
## SOC "1388" "1470" "2512" "3068" "3107" "1198" "1618" "3346" "3895" "3892"
## GES "1496" "1540" "2620" "3168" "3204" "1265" "1675" "3445" "3995" "3988"
## TUH "1629" "1544" "2745" "3251" "3275" "1267" "1650" "3519" "4076" "4054"
##     POP    SOC    GES    TUH   
## CRO "0.06" "0.11" "0.12" "0.07"
## ITP "0.04" "0.10" "0.11" "0.04"
## ITR "0.04" "0.10" "0.10" "0.04"
## ALD "0.09" "0.14" "0.14" "0.09"
## ITB "0.05" "0.13" "0.14" "0.08"
## SLO "0.04" "0.09" "0.10" "0.04"
## SER "0.17" "0.23" "0.23" "0.17"
## BUL "0.07" "0.13" "0.13" "0.07"
## FRS "0.04" "0.10" "0.11" "0.05"
## MAL "0.03" "0.09" "0.10" "0.04"
## ROS "0.07" "0.13" "0.14" "0.08"
## GRA "0.07" "0.11" "0.11" "0.06"
## STS "0.04" "0.11" "0.11" "0.05"
## BAR "0.09" "0.14" "0.14" "0.04"
## SPM "0.05" "0.12" "0.13" "0.07"
## TUA "0.06" "0.12" "0.12" "0.05"
## GRC "0.07" "0.12" "0.12" "0.07"
## SPC "0.05" "0.13" "0.13" "0.08"
## SPB "0.05" "0.12" "0.12" "0.08"
## SPS "0.09" "0.15" "0.16" "0.09"
## POP NA     "0.10" "0.10" "0.05"
## SOC "3906" NA     "0.01" "0.10"
## GES "4010" "109"  NA     "0.11"
## TUH "4104" "283"  "194"  NA

Make a table and save it as a word document

# Convert the matrix to a data frame and add a column with row names
merged_df <- as.data.frame(merged_matrix)
merged_df$Population <- rownames(merged_matrix)

# Reorder columns to have RowNames as the first column
merged_df <- merged_df[, c("Population", colnames(merged_matrix))]

merged_df1 <- as.data.frame(merged_df)
write.csv(merged_df1, "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/merged_df_MAF01_overlap.csv")

# Create a flextable object from the merged_matrix
ft <- qflextable(as.data.frame(merged_df)) 

ft

Population

CRO

ITP

ITR

ALD

ITB

SLO

SER

BUL

FRS

MAL

ROS

GRA

STS

BAR

SPM

TUA

GRC

SPC

SPB

SPS

POP

SOC

GES

TUH

CRO

0.05

0.05

0.05

0.09

0.06

0.17

0.09

0.06

0.06

0.10

0.06

0.06

0.09

0.09

0.07

0.05

0.10

0.09

0.11

0.06

0.11

0.12

0.07

ITP

201

0.02

0.08

0.07

0.04

0.17

0.06

0.04

0.02

0.08

0.05

0.04

0.08

0.06

0.06

0.06

0.08

0.07

0.09

0.04

0.10

0.11

0.04

ITR

477

371

0.08

0.07

0.04

0.16

0.05

0.04

0.02

0.08

0.05

0.04

0.07

0.07

0.04

0.06

0.09

0.08

0.07

0.04

0.10

0.10

0.04

ALD

180

224

585

0.12

0.08

0.22

0.11

0.09

0.08

0.12

0.08

0.10

0.12

0.11

0.10

0.07

0.13

0.12

0.14

0.09

0.14

0.14

0.09

ITB

591

583

302

752

0.06

0.22

0.10

0.06

0.06

0.10

0.10

0.07

0.13

0.09

0.10

0.10

0.09

0.08

0.13

0.05

0.13

0.14

0.08

SLO

509

585

455

689

245

0.15

0.07

0.04

0.04

0.06

0.05

0.04

0.08

0.05

0.06

0.06

0.06

0.05

0.09

0.04

0.09

0.10

0.04

SER

322

520

697

442

672

476

0.20

0.16

0.16

0.20

0.19

0.18

0.22

0.21

0.20

0.19

0.22

0.20

0.24

0.17

0.23

0.23

0.17

BUL

428

604

899

414

950

782

316

0.07

0.06

0.10

0.09

0.08

0.11

0.10

0.08

0.09

0.10

0.10

0.12

0.07

0.13

0.13

0.07

FRS

1038

1004

654

1192

448

629

1100

1393

0.04

0.07

0.07

0.03

0.09

0.06

0.07

0.07

0.07

0.06

0.10

0.04

0.10

0.11

0.05

MAL

817

621

694

746

996

1118

1139

1157

1270

0.07

0.05

0.04

0.08

0.06

0.05

0.06

0.07

0.07

0.07

0.03

0.09

0.10

0.04

ROS

683

884

1047

771

961

726

367

445

1341

1498

0.10

0.08

0.12

0.09

0.10

0.10

0.10

0.09

0.13

0.07

0.13

0.14

0.08

GRA

711

705

1070

534

1279

1220

884

656

1709

873

1100

0.07

0.09

0.09

0.07

0.02

0.10

0.09

0.10

0.07

0.11

0.11

0.06

STS

1053

1098

835

1230

536

546

989

1304

412

1523

1125

1764

0.09

0.07

0.07

0.07

0.07

0.06

0.10

0.04

0.11

0.11

0.05

BAR

1333

1227

862

1446

824

1059

1489

1742

512

1233

1783

1900

916

0.11

0.06

0.10

0.13

0.12

0.11

0.09

0.14

0.14

0.04

SPM

1360

1227

883

1450

916

1160

1555

1782

684

1128

1873

1864

1096

211

0.09

0.10

0.03

0.01

0.08

0.05

0.12

0.13

0.07

TUA

851

901

1271

694

1441

1340

932

641

1885

1148

1057

277

1886

2123

2103

0.08

0.11

0.10

0.03

0.06

0.12

0.12

0.05

GRC

934

884

1224

754

1467

1440

1141

925

1877

862

1370

270

1975

2007

1944

445

0.11

0.09

0.11

0.07

0.12

0.12

0.07

SPC

1601

1474

1124

1698

1125

1363

1781

2020

818

1368

2086

2116

1214

310

252

2354

2193

0.01

0.14

0.05

0.13

0.13

0.08

SPB

2158

2040

1683

2263

1644

1869

2313

2569

1264

1923

2594

2687

1601

827

823

2925

2760

571

0.13

0.05

0.12

0.12

0.08

SPS

2144

2002

1669

2226

1690

1930

2341

2568

1371

1785

2651

2601

1750

874

787

2855

2643

567

332

0.09

0.15

0.16

0.09

POP

2197

2103

1734

2320

1645

1852

2316

2591

1225

2067

2566

2777

1509

878

939

3001

2876

702

283

615

0.10

0.10

0.05

SOC

1748

1898

2224

1678

2262

2060

1591

1327

2689

2316

1388

1470

2512

3068

3107

1198

1618

3346

3895

3892

3906

0.01

0.10

GES

1845

1991

2321

1770

2365

2166

1695

1426

2794

2395

1496

1540

2620

3168

3204

1265

1675

3445

3995

3988

4010

109

0.11

TUH

1919

2051

2395

1828

2461

2272

1797

1512

2897

2412

1629

1544

2745

3251

3275

1267

1650

3519

4076

4054

4104

283

194

7.4. Mantel test with Europe Set 3 SNPs overlapping with microsats

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe
plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/overlap/r2_0.01_b \
--keep-fam output/fst/pops_4bfst_overlap2.txt \
--make-bed \
--out output/fst/overlap/mantel_overlap \
--silent;
grep 'samples\|variants\|remaining' output/fst/overlap/mantel_overlap.log

20968 variants loaded from .bim file. –keep-fam: 242 people remaining. Total genotyping rate in remaining samples is 0.973943. 20968 variants and 242 people pass filters and QC.

Then convert to raw format

plink \
--allow-extra-chr \
--keep-allele-order \
--bfile output/fst/overlap/mantel_overlap \
--recodeA \
--out output/fst/overlap/mantel_overlap \
--silent;
grep 'samples\|variants\|remaining' output/fst/overlap/mantel_overlap.log

20968 variants loaded from .bim file. 20968 variants and 242 people pass filters and QC.

Import the data and covert it to genind format

# import the data
albo <-
  read.PLINK(
    here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/mantel_overlap.raw"),
    quiet = FALSE,
    chunkSize = 1000,
    parallel = require("parallel"),
    n.cores = 4
  )

# convert to genind
albo2 <- gl2gi(albo, probar = TRUE, verbose = NULL)

Save

saveRDS(albo2, here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/albo2_overlap_MAF01.rds"
))

Load it

albo2 <- readRDS(here(
 "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/albo2_overlap_MAF01.rds"
))

Import Sample locations

sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
head(sampling_loc)
## # A tibble: 6 × 10
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 1 more variable: order <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/mantel_overlap.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# # Replace the BEN a with BEN b (remember to never name samples with the same ID... I change it manually in the fam file.)
# fam_data <- fam_data %>% 
#   mutate(IndividualID = ifelse(FamilyID == "BEN" & IndividualID == "a", "b", IndividualID))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Merge

# Join with sampling_loc to get sampling localities
loc_albo <- fam_data |>
  left_join(sampling_loc, by = c("FamilyID" = "Abbreviation"))

head(loc_albo)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype Pop_City Country
## 1      SOC         1065          0          0   0        -9    Sochi  Russia
## 2      SOC         1066          0          0   0        -9    Sochi  Russia
## 3      SOC         1067          0          0   0        -9    Sochi  Russia
## 4      SOC         1068          0          0   0        -9    Sochi  Russia
## 5      SOC         1069          0          0   0        -9    Sochi  Russia
## 6      SOC         1070          0          0   0        -9    Sochi  Russia
##   Latitude Longitude Continent Year         Region Marker order
## 1 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 2 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 3 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 4 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 5 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46
## 6 43.60042  39.74533    Europe 2021 Eastern Europe   SNPs    46

Get the latitude and longitude

albo_dist2 <- cbind(loc_albo$Latitude, loc_albo$Longitude)
head(albo_dist2)
##          [,1]     [,2]
## [1,] 43.60042 39.74533
## [2,] 43.60042 39.74533
## [3,] 43.60042 39.74533
## [4,] 43.60042 39.74533
## [5,] 43.60042 39.74533
## [6,] 43.60042 39.74533
colnames(albo_dist2)<- c("x","y") 

Add jitter

albo_dist2 <- jitter(albo_dist2, factor = 1, amount = NULL)
head(albo_dist2)
##             x        y
## [1,] 43.60075 39.75428
## [2,] 43.60325 39.75480
## [3,] 43.59532 39.75272
## [4,] 43.59759 39.74487
## [5,] 43.60104 39.73846
## [6,] 43.59365 39.75122

Add to object

albo2$other$xy <- albo_dist2

Save

saveRDS(
  albo2,
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/albo2_SNP_overlap_MAF1.rds"
  )
)

7.4.1 Isolation By Distance for Europe (Set 3) SNPs overlapping with microsats

Convert the data

toto <- genind2genpop(albo2)
## 
##  Converting data from a genind to a genpop object... 
## 
## ...done.

Get 1 mosquito per population, it is just to get the geographical coordinates

unique_populations <- unique(albo2@pop)
selected_individuals <- integer(length(unique_populations))
for (i in seq_along(unique_populations)) {
  inds_in_pop <- which(albo2@pop == unique_populations[i])
  selected_individuals[i] <- sample(inds_in_pop, 1)
}
albo2_subset <- albo2[selected_individuals, ]

Mantel test

Dgen <- dist.genpop(toto,method=2)
Dgeo <- dist(albo2_subset$other$xy)
ibd <- mantel.randtest(Dgen,Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)
## 
## Observation: -0.09970885 
## 
## Based on 999 replicates
## Simulated p-value: 0.754 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
## -0.749669092 -0.003145698  0.016591424

Monte-Carlo test Call: mantel.randtest(m1 = Dgen, m2 = Dgeo)

Observation: -0.09965469

Based on 999 replicates Simulated p-value: 0.753 Alternative hypothesis: greater

Std.Obs Expectation    Variance 

-0.76056551 -0.00085997 0.01687307

Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/simIBD_SNPs_MAF01_overlap.pdf"))
plot(ibd)
dev.off()
plot(ibd)

plot(Dgeo, Dgen)
# A linear regression model (lm stands for "linear model") is fitted, with the genetic distances (Dgen) as the response variable and the geographic distances (Dgeo) as the predictor. The distances are transformed into vectors using as.vector because the dist function produces a matrix-like structure, but the linear regression function lm requires vectors.
dist_lm <- lm(as.vector(Dgen) ~ as.vector(Dgeo))
abline(dist_lm, col="red", lty=2)

Save Plot

# Plot it
# Start the PDF device
CairoPDF(here(
     "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/Genetic_v_Geog_distance_SNPs_MAF01_overlap.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

dev.off()

Save Plot without equation

# Plot it
# Start the PDF device
CairoPDF(here(
     "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/Genetic_v_Geog_distance_SNPs_MAF01_overlap_noequ.pdf"))
plot(Dgeo, Dgen, main = "Genetic Distance vs Geographic Distance")
abline(dist_lm, col="red", lty=2)

# Extracting the coefficients from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Generating the equation string
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

#text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.85, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

dev.off()

Use library MASS for plot

library(MASS)

dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
#myPal <-
#  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
myPal <-
  colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

title("Isolation by distance")

Save plot

library(MASS)
CairoPDF(here(
     "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/IDB_PlotFromMASS_density_SNPs_MAF01_overlap_purple.pdf"))
dens <- kde2d(as.vector(Dgeo), as.vector(Dgen), n = 500)
#myPal <-
#  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
myPal <-
 colorRampPalette(c("white", "purple", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.2fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
#text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")

dev.off()

Save plot with equation

png(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/IDB_PlotFromMASS_density_SNPs_MAF01_overlap_equation.png"),
    width = 5,
    height = 4,
    units='in',
    res = 300)
myPal <-
  colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(Dgeo, Dgen, pch = 20, cex = .3, bty = "n")
image(dens, col = transp(myPal(300), .7), add = TRUE)
abline(dist_lm)
# Extracting the coefficients and R^2 from the linear model
intercept <- coef(dist_lm)[1]
slope <- coef(dist_lm)[2]
r2 <- summary(dist_lm)$r.squared

# Constructing the equation and R^2 strings
equation <- sprintf("y = %.4fx + %.2f", slope, intercept)
r2_label <- sprintf("R^2 = %.2f", r2)

# Adding the equation and R^2 to the plot
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.95, labels = equation)
text(x = max(as.vector(Dgeo)) * 0.8, y = max(as.vector(Dgen)) * 0.90, labels = r2_label)

title("Isolation by distance")
dev.off()