Plots of structure output for euro_global dataset
1. Load the standard packages
2. K=20 LEA maps for SNP Set 2 (r2<0.1) for Europe global dataset
- 2.1 Pie plots for LEA using SNP Set 2 (r2<0.1)
3. K=5 LEA maps for SNP Set 2 (r1<0.1) for Europe global dataset
- 3.1 Pie plots for LEA, k=5, SNP Set 2 (r2<0.1)
4. K=19 fastStructure simple maps for SNP Set 2 (r2<0.1) for Europe global dataset
- 4.1 Import samples attributes
  - 4.1.1 Import the Q matrix (K19 for fastStructure)
- 4.2 Q-values
SNP Set 3 (MAF 1%) plots
5. LEA SNP Set 3 (MAF 1%) for euro_global dataset
6. K=7 for global SNP Set 3
- 6.1 Pie plots for LEA, k=7 for SNP Set 3 (r2<0.01,MAF 1%)
- 6.2 K=6 for global SNP Set 3
  - 6.2.1 Pie plots for LEA, K=6 for SNP Set 3(r2<0.01, MAF 1%)

Plots of structure output for euro_global dataset

1. Load the standard packages

#install.packages(c("raster", "sf", "tidyverse", "fields", "downloader"))

library(raster)  # important to load before tidyverse, otherwise it masks select()
library(tidyverse)
library(sf)
library(ggspatial)
library(ggplot2)
library(dplyr)
library(colorout)
library(here)
library(extrafont)

2. K=20 LEA maps for SNP Set 2 (r2<0.1) for Europe global dataset

Change to select matrix from best LEA run

leak20 <- read_delim(
  here("euro_global/output/snps_sets/r2_0.1.snmf/K20/run1/r2_0.1_r1.20.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(leak20)

## # A tibble: 6 × 20
##      X1       X2      X3      X4      X5      X6      X7      X8      X9     X10
##   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.467  5.65e-3 2.91e-2 2.08e-2 2.06e-2 2.30e-2 7.48e-2 7.89e-3 5.27e-2 3.98e-2
## 2 0.311  2.01e-2 8.34e-2 1.82e-2 2.75e-2 2.10e-2 1.22e-1 1.00e-4 4.56e-2 1.27e-2
## 3 0.990  9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5
## 4 0.845  9.99e-5 3.00e-4 9.99e-5 6.99e-3 9.99e-5 2.16e-2 6.00e-2 9.99e-5 1.42e-2
## 5 0.878  9.99e-5 9.99e-5 9.99e-5 1.74e-2 4.22e-3 1.23e-2 5.27e-2 2.50e-3 1.17e-2
## 6 0.998  9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5
## # ℹ 10 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>

The fam file

fam_file <- here(
  "euro_global/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Change ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

leak20 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak20)

head(leak20)

##    ind pop       X1          X2          X3          X4          X5          X6
## 1 1001 OKI 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02 2.29631e-02
## 2 1002 OKI 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02 2.10350e-02
## 3 1003 OKI 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1004 OKI 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03 9.99280e-05
## 5 1005 OKI 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02 4.21508e-03
## 6 1006 OKI 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##            X7          X8          X9         X10         X11         X12
## 1 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03 1.01131e-02
## 2 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02 9.99730e-05
## 3 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03 1.23783e-02
## 5 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03 9.99262e-05
## 6 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##           X13         X14         X15         X16         X17         X18
## 1 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03 5.56498e-02
## 2 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02 4.49698e-02
## 3 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05 9.99280e-05
## 5 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04 9.99262e-05
## 6 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##           X19         X20
## 1 1.78205e-03 4.34181e-03
## 2 7.05643e-03 9.99730e-05
## 3 9.98380e-05 9.98380e-05
## 4 1.36630e-02 9.99280e-05
## 5 1.01074e-02 9.99262e-05
## 6 9.98289e-05 9.98289e-05

Rename the columns

# Rename the columns starting from the third one
leak20 <- leak20 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak20)

##    ind pop       v1          v2          v3          v4          v5          v6
## 1 1001 OKI 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02 2.29631e-02
## 2 1002 OKI 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02 2.10350e-02
## 3 1003 OKI 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1004 OKI 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03 9.99280e-05
## 5 1005 OKI 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02 4.21508e-03
## 6 1006 OKI 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##            v7          v8          v9         v10         v11         v12
## 1 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03 1.01131e-02
## 2 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02 9.99730e-05
## 3 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03 1.23783e-02
## 5 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03 9.99262e-05
## 6 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##           v13         v14         v15         v16         v17         v18
## 1 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03 5.56498e-02
## 2 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02 4.49698e-02
## 3 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05 9.99280e-05
## 5 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04 9.99262e-05
## 6 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##           v19         v20
## 1 1.78205e-03 4.34181e-03
## 2 7.05643e-03 9.99730e-05
## 3 9.98380e-05 9.98380e-05
## 4 1.36630e-02 9.99280e-05
## 5 1.01074e-02 9.99262e-05
## 6 9.98289e-05 9.98289e-05

Import samples attributes

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)

selected<-subset (sampling_loc, !is.na(sampling_loc[,10]))

pops <- selected |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

pops <- pops |> filter(!is.na(order))

head(pops)

##   Abbreviation Latitude Longitude     Pop_City Country        Region Subregion
## 1          BER 39.79081  -74.9291   Berlin, NJ     USA North America          
## 2          COL 39.97170  -82.9071 Columbus, OH     USA North America          
## 3          PAL 26.70560  -80.0364   Palm Beach     USA North America          
## 4          HOU 29.75491  -95.3505  Houston, TX     USA North America          
## 5          LOS 34.05220 -118.2437  Los Angeles     USA North America          
## 6          MAU -3.09161  -60.0325   Manaus, AM  Brazil South America          
##   Year order
## 1 2018     1
## 2 2015     2
## 3 2018     3
## 4 2018     4
## 5 2018     5
## 6 2017     6

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
   dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

pops <- pops |> filter(!is.na(order))

head(pops)

##   Abbreviation Latitude Longitude     Pop_City Country        Region Subregion
## 1          BER 39.79081  -74.9291   Berlin, NJ     USA North America          
## 2          COL 39.97170  -82.9071 Columbus, OH     USA North America          
## 3          PAL 26.70560  -80.0364   Palm Beach     USA North America          
## 4          HOU 29.75491  -95.3505  Houston, TX     USA North America          
## 5          LOS 34.05220 -118.2437  Los Angeles     USA North America          
## 6          MAU -3.09161  -60.0325   Manaus, AM  Brazil South America          
##   Year order
## 1 2018     1
## 2 2015     2
## 3 2018     3
## 4 2018     4
## 5 2018     5
## 6 2017     6

Merge with pops

# Add an index column to Q_tibble
leak20$index <- seq_len(nrow(leak20))

# Perform the merge as before
df1 <-
  merge(
    leak20,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1          v2          v3          v4          v5
## 411 OKI 1001 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02
## 412 OKI 1002 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02
## 413 OKI 1003 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 OKI 1004 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03
## 415 OKI 1005 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02
## 416 OKI 1006 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##              v6          v7          v8          v9         v10         v11
## 411 2.29631e-02 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03
## 412 2.10350e-02 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.99280e-05 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03
## 415 4.21508e-03 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##             v12         v13         v14         v15         v16         v17
## 411 1.01131e-02 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03
## 412 9.99730e-05 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05
## 414 1.23783e-02 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05
## 415 9.99262e-05 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##             v18         v19         v20 Latitude Longitude Pop_City Country
## 411 5.56498e-02 1.78205e-03 4.34181e-03  26.5013  127.9454  Okinawa   Japan
## 412 4.49698e-02 7.05643e-03 9.99730e-05  26.5013  127.9454  Okinawa   Japan
## 413 9.98380e-05 9.98380e-05 9.98380e-05  26.5013  127.9454  Okinawa   Japan
## 414 9.99280e-05 1.36630e-02 9.99280e-05  26.5013  127.9454  Okinawa   Japan
## 415 9.99262e-05 1.01074e-02 9.99262e-05  26.5013  127.9454  Okinawa   Japan
## 416 9.98289e-05 9.98289e-05 9.98289e-05  26.5013  127.9454  Okinawa   Japan
##        Region Subregion  Year order
## 411 East Asia           2018?    54
## 412 East Asia           2018?    54
## 413 East Asia           2018?    54
## 414 East Asia           2018?    54
## 415 East Asia           2018?    54
## 416 East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

Load libraries for pie-chart maps

# Load necessary libraries
library(ggplot2)
library(scatterpie)

## Registered S3 method overwritten by 'ggforce':
##   method           from 
##   scale_type.units units

## 
## Attaching package: 'scatterpie'

## The following object is masked from 'package:sp':
## 
##     recenter

library(sf)
library(rnaturalearth)
library(rnaturalearthdata)

## 
## Attaching package: 'rnaturalearthdata'

## The following object is masked from 'package:rnaturalearth':
## 
##     countries110

library(ggrepel)
library(Cairo)

Create color palette

color_palette_20 <-
  c(
    "#FF8C1A",
    "#B20CC9",
    "#77DD77",
    "#1E90FF",
    "#B22222",
    "green",
    "yellow2",
    "#F49AC2",
    "blue",
    "#008080", 
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "#AE8333",
    "purple",
    "orangered"
  )

2.1 Pie plots for LEA using SNP Set 2 (r2<0.1)

2.1.1 Make pie plot showing all continents

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_20) +
  guides(fill = "none") +  # Hide legend
  coord_sf() +
  #coord_sf(xlim = c(-90, 145), ylim = c(-30, 60)) +
  my_theme()

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k20_euro_global_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.1.2 Make pie plots just for Europe

with SNP Set 2 LEA matrix

df1 <- df1 |>
filter(
    Region == "Eastern Europe" | Region == "Southern Europe" | Region == "Western Europe"
  )

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_20) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k20_euro_global_EUROPE_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.1.2 Make pie plot showing just Asia

Filter out Asia from the matrix

# Add an index column to Q_tibble
leak20$index <- seq_len(nrow(leak20))

# Perform the merge as before
df1 <-
  merge(
    leak20,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1          v2          v3          v4          v5
## 411 OKI 1001 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02
## 412 OKI 1002 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02
## 413 OKI 1003 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 OKI 1004 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03
## 415 OKI 1005 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02
## 416 OKI 1006 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##              v6          v7          v8          v9         v10         v11
## 411 2.29631e-02 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03
## 412 2.10350e-02 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.99280e-05 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03
## 415 4.21508e-03 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##             v12         v13         v14         v15         v16         v17
## 411 1.01131e-02 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03
## 412 9.99730e-05 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05
## 414 1.23783e-02 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05
## 415 9.99262e-05 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##             v18         v19         v20 Latitude Longitude Pop_City Country
## 411 5.56498e-02 1.78205e-03 4.34181e-03  26.5013  127.9454  Okinawa   Japan
## 412 4.49698e-02 7.05643e-03 9.99730e-05  26.5013  127.9454  Okinawa   Japan
## 413 9.98380e-05 9.98380e-05 9.98380e-05  26.5013  127.9454  Okinawa   Japan
## 414 9.99280e-05 1.36630e-02 9.99280e-05  26.5013  127.9454  Okinawa   Japan
## 415 9.99262e-05 1.01074e-02 9.99262e-05  26.5013  127.9454  Okinawa   Japan
## 416 9.98289e-05 9.98289e-05 9.98289e-05  26.5013  127.9454  Okinawa   Japan
##        Region Subregion  Year order
## 411 East Asia           2018?    54
## 412 East Asia           2018?    54
## 413 East Asia           2018?    54
## 414 East Asia           2018?    54
## 415 East Asia           2018?    54
## 416 East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

df1 <- df1 |>
filter(
    Region == "East Asia" | Region == "South Asia" | Region == "Southeast Asia"
  )

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_20) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
  my_theme()

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k20_euro_global_ASIA_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.1.3 Make pie plot showing just Americas

Filter out America from the matrix

# Add an index column to Q_tibble
leak20$index <- seq_len(nrow(leak20))

# Perform the merge as before
df1 <-
  merge(
    leak20,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1          v2          v3          v4          v5
## 411 OKI 1001 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02
## 412 OKI 1002 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02
## 413 OKI 1003 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 OKI 1004 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03
## 415 OKI 1005 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02
## 416 OKI 1006 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##              v6          v7          v8          v9         v10         v11
## 411 2.29631e-02 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03
## 412 2.10350e-02 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.99280e-05 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03
## 415 4.21508e-03 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##             v12         v13         v14         v15         v16         v17
## 411 1.01131e-02 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03
## 412 9.99730e-05 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05
## 414 1.23783e-02 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05
## 415 9.99262e-05 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
##             v18         v19         v20 Latitude Longitude Pop_City Country
## 411 5.56498e-02 1.78205e-03 4.34181e-03  26.5013  127.9454  Okinawa   Japan
## 412 4.49698e-02 7.05643e-03 9.99730e-05  26.5013  127.9454  Okinawa   Japan
## 413 9.98380e-05 9.98380e-05 9.98380e-05  26.5013  127.9454  Okinawa   Japan
## 414 9.99280e-05 1.36630e-02 9.99280e-05  26.5013  127.9454  Okinawa   Japan
## 415 9.99262e-05 1.01074e-02 9.99262e-05  26.5013  127.9454  Okinawa   Japan
## 416 9.98289e-05 9.98289e-05 9.98289e-05  26.5013  127.9454  Okinawa   Japan
##        Region Subregion  Year order
## 411 East Asia           2018?    54
## 412 East Asia           2018?    54
## 413 East Asia           2018?    54
## 414 East Asia           2018?    54
## 415 East Asia           2018?    54
## 416 East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

df1 <- df1 |>
filter(
    Region == "North America" | Region == "South America"
  )

world <- ne_countries(scale = "medium", returnclass = "sf")

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns", 
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19","v20"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_20) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-120, -15), ylim = c(-30, 60)) +
  my_theme()

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k20_euro_global_Americas_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3. K=5 LEA maps for SNP Set 2 (r1<0.1) for Europe global dataset

replace with best Q matrix

leak5 <- read_delim(
  here("euro_global/output/snps_sets/r2_0.1.snmf/K5/run1/r2_0.1_r1.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak5)

## # A tibble: 6 × 5
##         X1    X2    X3     X4    X5
##      <dbl> <dbl> <dbl>  <dbl> <dbl>
## 1 0.0100   0.432 0.201 0.137  0.220
## 2 0.0200   0.435 0.204 0.121  0.220
## 3 0.0157   0.464 0.175 0.147  0.198
## 4 0.000100 0.452 0.226 0.0883 0.233
## 5 0.000100 0.462 0.199 0.109  0.230
## 6 0.0234   0.457 0.192 0.116  0.211

The fam file

fam_file <- here(
  "euro_global/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Change ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

leak5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak5)

head(leak5)

##    ind pop         X1       X2       X3        X4       X5
## 1 1001 OKI 0.01004940 0.432031 0.201044 0.1370630 0.219812
## 2 1002 OKI 0.01995590 0.434927 0.203757 0.1211740 0.220186
## 3 1003 OKI 0.01567250 0.464427 0.174829 0.1474360 0.197636
## 4 1004 OKI 0.00009999 0.452051 0.226127 0.0883153 0.233406
## 5 1005 OKI 0.00009999 0.462282 0.199097 0.1087080 0.229812
## 6 1006 OKI 0.02340520 0.457148 0.191711 0.1164300 0.211305

Rename the columns

# Rename the columns starting from the third one
leak5 <- leak5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak5)

##    ind pop         v1       v2       v3        v4       v5
## 1 1001 OKI 0.01004940 0.432031 0.201044 0.1370630 0.219812
## 2 1002 OKI 0.01995590 0.434927 0.203757 0.1211740 0.220186
## 3 1003 OKI 0.01567250 0.464427 0.174829 0.1474360 0.197636
## 4 1004 OKI 0.00009999 0.452051 0.226127 0.0883153 0.233406
## 5 1005 OKI 0.00009999 0.462282 0.199097 0.1087080 0.229812
## 6 1006 OKI 0.02340520 0.457148 0.191711 0.1164300 0.211305

Import samples attributes for euro_global

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
   dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

pops <- pops |> filter(!is.na(order))

head(pops)

##   Abbreviation Latitude Longitude     Pop_City Country        Region Subregion
## 1          BER 39.79081  -74.9291   Berlin, NJ     USA North America          
## 2          COL 39.97170  -82.9071 Columbus, OH     USA North America          
## 3          PAL 26.70560  -80.0364   Palm Beach     USA North America          
## 4          HOU 29.75491  -95.3505  Houston, TX     USA North America          
## 5          LOS 34.05220 -118.2437  Los Angeles     USA North America          
## 6          MAU -3.09161  -60.0325   Manaus, AM  Brazil South America          
##   Year order
## 1 2018     1
## 2 2015     2
## 3 2018     3
## 4 2018     4
## 5 2018     5
## 6 2017     6

Merge with pops

# Add an index column to Q_tibble
leak5$index <- seq_len(nrow(leak5))

# Perform the merge as before
df1 <-
  merge(
    leak5,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind         v1       v2       v3        v4       v5 Latitude Longitude
## 411 OKI 1001 0.01004940 0.432031 0.201044 0.1370630 0.219812  26.5013  127.9454
## 412 OKI 1002 0.01995590 0.434927 0.203757 0.1211740 0.220186  26.5013  127.9454
## 413 OKI 1003 0.01567250 0.464427 0.174829 0.1474360 0.197636  26.5013  127.9454
## 414 OKI 1004 0.00009999 0.452051 0.226127 0.0883153 0.233406  26.5013  127.9454
## 415 OKI 1005 0.00009999 0.462282 0.199097 0.1087080 0.229812  26.5013  127.9454
## 416 OKI 1006 0.02340520 0.457148 0.191711 0.1164300 0.211305  26.5013  127.9454
##     Pop_City Country    Region Subregion  Year order
## 411  Okinawa   Japan East Asia           2018?    54
## 412  Okinawa   Japan East Asia           2018?    54
## 413  Okinawa   Japan East Asia           2018?    54
## 414  Okinawa   Japan East Asia           2018?    54
## 415  Okinawa   Japan East Asia           2018?    54
## 416  Okinawa   Japan East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

Make a new palette with only 5 colors

color_palette_5 <-
  c(
    "#1E90FF",
    "#77DD37",
    "#FF8C1A",
    "#FFFF19",
    "purple3" 
     )

3.1 Pie plots for LEA, k=5, SNP Set 2 (r2<0.1)

3.1.1 Pie plots showing all continents

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_5) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-20, 150), ylim = c(-30, 60)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k5_euro_global_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.1 2 Pie plots for Europe

df1 <- df1 |>
filter(
    Region == "Eastern Europe" | Region == "Southern Europe" | Region == "Western Europe"
  )

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns", 
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_5) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-15, 50), ylim = c(30, 55)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k5_euro_global_EUROPE_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.1.3 Pie plots for just Asia

Filter out Asia from the matrix

# Perform the merge as before
df1 <-
  merge(
    leak5,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind         v1       v2       v3        v4       v5 Latitude Longitude
## 411 OKI 1001 0.01004940 0.432031 0.201044 0.1370630 0.219812  26.5013  127.9454
## 412 OKI 1002 0.01995590 0.434927 0.203757 0.1211740 0.220186  26.5013  127.9454
## 413 OKI 1003 0.01567250 0.464427 0.174829 0.1474360 0.197636  26.5013  127.9454
## 414 OKI 1004 0.00009999 0.452051 0.226127 0.0883153 0.233406  26.5013  127.9454
## 415 OKI 1005 0.00009999 0.462282 0.199097 0.1087080 0.229812  26.5013  127.9454
## 416 OKI 1006 0.02340520 0.457148 0.191711 0.1164300 0.211305  26.5013  127.9454
##     Pop_City Country    Region Subregion  Year order
## 411  Okinawa   Japan East Asia           2018?    54
## 412  Okinawa   Japan East Asia           2018?    54
## 413  Okinawa   Japan East Asia           2018?    54
## 414  Okinawa   Japan East Asia           2018?    54
## 415  Okinawa   Japan East Asia           2018?    54
## 416  Okinawa   Japan East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

df1 <- df1 |>
filter(
    Region == "East Asia" | Region == "South Asia" | Region == "Southeast Asia"
  )

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)


world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns", 
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_5) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(60, 150), ylim = c(-10, 55)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k5_euro_global_Asia_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.1.4 Pie plots for just Americas

# Perform the merge as before
df1 <-
  merge(
    leak5,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind         v1       v2       v3        v4       v5 Latitude Longitude
## 411 OKI 1001 0.01004940 0.432031 0.201044 0.1370630 0.219812  26.5013  127.9454
## 412 OKI 1002 0.01995590 0.434927 0.203757 0.1211740 0.220186  26.5013  127.9454
## 413 OKI 1003 0.01567250 0.464427 0.174829 0.1474360 0.197636  26.5013  127.9454
## 414 OKI 1004 0.00009999 0.452051 0.226127 0.0883153 0.233406  26.5013  127.9454
## 415 OKI 1005 0.00009999 0.462282 0.199097 0.1087080 0.229812  26.5013  127.9454
## 416 OKI 1006 0.02340520 0.457148 0.191711 0.1164300 0.211305  26.5013  127.9454
##     Pop_City Country    Region Subregion  Year order
## 411  Okinawa   Japan East Asia           2018?    54
## 412  Okinawa   Japan East Asia           2018?    54
## 413  Okinawa   Japan East Asia           2018?    54
## 414  Okinawa   Japan East Asia           2018?    54
## 415  Okinawa   Japan East Asia           2018?    54
## 416  Okinawa   Japan East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

Filter out all but Americas from the matrix

df1 <- df1 |>
filter(
    Region == "North America" | Region == "South America"
  )

Make pie plots

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_5) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-130, -15), ylim = c(-35, 50)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k5_euro_global_Americas_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

4. K=19 fastStructure simple maps for SNP Set 2 (r2<0.1) for Europe global dataset

4.1 Import samples attributes

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)

selected<-subset (sampling_loc, !is.na(sampling_loc[,10]))

pops <- selected |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude     Pop_City Country        Region Subregion
## 1          BER 39.79081  -74.9291   Berlin, NJ     USA North America          
## 2          COL 39.97170  -82.9071 Columbus, OH     USA North America          
## 3          PAL 26.70560  -80.0364   Palm Beach     USA North America          
## 4          HOU 29.75491  -95.3505  Houston, TX     USA North America          
## 5          LOS 34.05220 -118.2437  Los Angeles     USA North America          
## 6          MAU -3.09161  -60.0325   Manaus, AM  Brazil South America          
##   Year order
## 1 2018     1
## 2 2015     2
## 3 2018     3
## 4 2018     4
## 5 2018     5
## 6 2017     6

4.1.1 Import the Q matrix (K19 for fastStructure)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k19run4 <- read_delim(
  here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run004/simple.19.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k19run4 <- k19run4 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k19run4)

## # A tibble: 6 × 19
##      X1    X2    X3    X4    X5     X6    X7    X8    X9    X10   X11   X12
##   <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0     0     0     0 0.264 0          0     0     0 0.0450     0     0
## 2     0     0     0     0 0     0.0669     0     0     0 0          0     0
## 3     0     0     0     0 0     0          0     0     0 0          0     0
## 4     0     0     0     0 0     0          0     0     0 0          0     0
## 5     0     0     0     0 0     0          0     0     0 0          0     0
## 6     0     0     0     0 0     0          0     0     0 0          0     0
## # ℹ 7 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>, X19 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

k19run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k19run4)

head(k19run4)

##    ind pop X1 X2 X3 X4     X5       X6 X7 X8 X9      X10 X11 X12      X13
## 1 1001 OKI  0  0  0  0 0.2643 0.000000  0  0  0 0.044959   0   0 0.078525
## 2 1002 OKI  0  0  0  0 0.0000 0.066895  0  0  0 0.000000   0   0 0.000000
## 3 1003 OKI  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
## 4 1004 OKI  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
## 5 1005 OKI  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
## 6 1006 OKI  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
##        X14      X15 X16      X17      X18 X19
## 1 0.612210 0.000000   0 0.000000 0.000000   0
## 2 0.449903 0.219687   0 0.172857 0.090652   0
## 3 0.999991 0.000000   0 0.000000 0.000000   0
## 4 0.999991 0.000000   0 0.000000 0.000000   0
## 5 0.999991 0.000000   0 0.000000 0.000000   0
## 6 0.999991 0.000000   0 0.000000 0.000000   0

Rename the columns

# Rename the columns starting from the third one
k19run4 <- k19run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k19run4)

##    ind pop v1 v2 v3 v4     v5       v6 v7 v8 v9      v10 v11 v12      v13
## 1 1001 OKI  0  0  0  0 0.2643 0.000000  0  0  0 0.044959   0   0 0.078525
## 2 1002 OKI  0  0  0  0 0.0000 0.066895  0  0  0 0.000000   0   0 0.000000
## 3 1003 OKI  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
## 4 1004 OKI  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
## 5 1005 OKI  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
## 6 1006 OKI  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
##        v14      v15 v16      v17      v18 v19
## 1 0.612210 0.000000   0 0.000000 0.000000   0
## 2 0.449903 0.219687   0 0.172857 0.090652   0
## 3 0.999991 0.000000   0 0.000000 0.000000   0
## 4 0.999991 0.000000   0 0.000000 0.000000   0
## 5 0.999991 0.000000   0 0.000000 0.000000   0
## 6 0.999991 0.000000   0 0.000000 0.000000   0

Merge with pops

# Add an index column to Q_tibble
k19run4$index <- seq_len(nrow(k19run4))

# Perform the merge as before
df1 <-
  merge(
    k19run4,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind v1 v2 v3 v4     v5       v6 v7 v8 v9      v10 v11 v12      v13
## 411 OKI 1001  0  0  0  0 0.2643 0.000000  0  0  0 0.044959   0   0 0.078525
## 412 OKI 1002  0  0  0  0 0.0000 0.066895  0  0  0 0.000000   0   0 0.000000
## 413 OKI 1003  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
## 414 OKI 1004  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
## 415 OKI 1005  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
## 416 OKI 1006  0  0  0  0 0.0000 0.000000  0  0  0 0.000000   0   0 0.000000
##          v14      v15 v16      v17      v18 v19 Latitude Longitude Pop_City
## 411 0.612210 0.000000   0 0.000000 0.000000   0  26.5013  127.9454  Okinawa
## 412 0.449903 0.219687   0 0.172857 0.090652   0  26.5013  127.9454  Okinawa
## 413 0.999991 0.000000   0 0.000000 0.000000   0  26.5013  127.9454  Okinawa
## 414 0.999991 0.000000   0 0.000000 0.000000   0  26.5013  127.9454  Okinawa
## 415 0.999991 0.000000   0 0.000000 0.000000   0  26.5013  127.9454  Okinawa
## 416 0.999991 0.000000   0 0.000000 0.000000   0  26.5013  127.9454  Okinawa
##     Country    Region Subregion  Year order
## 411   Japan East Asia           2018?    54
## 412   Japan East Asia           2018?    54
## 413   Japan East Asia           2018?    54
## 414   Japan East Asia           2018?    54
## 415   Japan East Asia           2018?    54
## 416   Japan East Asia           2018?    54

4.2 Q-values

Need a color palette with at least 19 colors (for K=19) I’m using the same palette from the PCA

color_palette_19 <-
  c(
    "#AE9333",
    "#FFB347",
    "#77DD77",
    "#779ECB",
    "#1E90FF",
    "#B22222",
    "#FF8C1A",
    "#B20CC9",
    "#F49AC2",
    "blue",
    "#008080", 
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "green"
  )

Make pie plot

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "scripts", "RMarkdowns", "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_19) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
  my_theme()

ggsave(
  here("output", "euro_global", "figures", "fastStructure", "fastStructure_euroglobal_r1_k19_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

SNP Set 3 (MAF 1%) plots

library(LEA)
library(vcfR)
library(RColorBrewer)
library(adegenet)
#library(ape)
library(tidyverse)
library(here)
library(dplyr)
library(ggplot2)
library(colorout)
library(extrafont)
#library(scales)
library(stringr)
library(ggtext)

5. LEA SNP Set 3 (MAF 1%) for euro_global dataset

Change to select matrix from best LEA run

leak22 <- read_delim(
  here("euro_global/output/snps_sets/r2_0.01_b.snmf/K22/run5/r2_0.01_b_r5.22.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(leak22)

## # A tibble: 6 × 22
##         X1      X2      X3      X4    X5      X6      X7      X8      X9     X10
##      <dbl>   <dbl>   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1  1.13e-1 9.99e-5 9.99e-5 9.99e-5 0.494 5.05e-2 9.99e-5 1.79e-3 3.43e-2 5.14e-2
## 2  4.86e-2 1.00e-4 1.03e-2 2.78e-3 0.319 1.03e-3 3.56e-2 8.74e-3 6.26e-2 6.63e-2
## 3  9.98e-5 9.98e-5 9.98e-5 9.98e-5 0.980 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5
## 4  9.99e-5 9.99e-5 5.88e-3 3.40e-2 0.817 9.99e-5 1.80e-2 9.99e-5 5.80e-2 2.02e-2
## 5  9.99e-5 9.99e-5 9.99e-5 3.84e-3 0.844 5.71e-3 1.21e-2 9.99e-5 6.06e-2 6.65e-3
## 6  9.98e-5 9.98e-5 9.98e-5 9.98e-5 0.998 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5
## # ℹ 12 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>, X22 <dbl>

The fam file

fam_file <- here(
  "euro_global/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Change ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

leak22 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak22)

head(leak22)

##    ind pop          X1          X2          X3          X4       X5          X6
## 1 1001 OKI 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779 5.05374e-02
## 2 1002 OKI 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323 1.02586e-03
## 3 1003 OKI 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697 9.98380e-05
## 4 1004 OKI 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342 9.98920e-05
## 5 1005 OKI 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145 5.71364e-03
## 6 1006 OKI 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904 9.98108e-05
##            X7          X8          X9         X10         X11         X12
## 1 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02 2.66775e-02
## 2 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02 3.31449e-02
## 3 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02 9.98920e-05
## 5 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##           X13         X14         X15         X16         X17         X18
## 1 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02 9.00488e-03
## 2 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02 9.99550e-05
## 3 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05 4.38346e-03
## 4 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03 9.98920e-05
## 5 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##           X19         X20         X21         X22
## 1 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05
## 2 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05
## 3 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05
## 4 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05
## 5 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05

Rename the columns

# Rename the columns starting from the third one
leak22 <- leak22 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak22)

##    ind pop          v1          v2          v3          v4       v5          v6
## 1 1001 OKI 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779 5.05374e-02
## 2 1002 OKI 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323 1.02586e-03
## 3 1003 OKI 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697 9.98380e-05
## 4 1004 OKI 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342 9.98920e-05
## 5 1005 OKI 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145 5.71364e-03
## 6 1006 OKI 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904 9.98108e-05
##            v7          v8          v9         v10         v11         v12
## 1 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02 2.66775e-02
## 2 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02 3.31449e-02
## 3 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02 9.98920e-05
## 5 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##           v13         v14         v15         v16         v17         v18
## 1 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02 9.00488e-03
## 2 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02 9.99550e-05
## 3 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05 4.38346e-03
## 4 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03 9.98920e-05
## 5 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##           v19         v20         v21         v22
## 1 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05
## 2 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05
## 3 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05
## 4 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05
## 5 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05

Import samples attributes

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)

selected<-subset (sampling_loc, !is.na(sampling_loc[,10]))

pops <- selected |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

pops <- pops |> filter(!is.na(order))

head(pops)

##   Abbreviation Latitude Longitude     Pop_City Country        Region Subregion
## 1          BER 39.79081  -74.9291   Berlin, NJ     USA North America          
## 2          COL 39.97170  -82.9071 Columbus, OH     USA North America          
## 3          PAL 26.70560  -80.0364   Palm Beach     USA North America          
## 4          HOU 29.75491  -95.3505  Houston, TX     USA North America          
## 5          LOS 34.05220 -118.2437  Los Angeles     USA North America          
## 6          MAU -3.09161  -60.0325   Manaus, AM  Brazil South America          
##   Year order
## 1 2018     1
## 2 2015     2
## 3 2018     3
## 4 2018     4
## 5 2018     5
## 6 2017     6

Merge with pops

# Add an index column to Q_tibble
leak22$index <- seq_len(nrow(leak22))

# Perform the merge as before
df1 <-
  merge(
    leak22,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind          v1          v2          v3          v4       v5
## 411 OKI 1001 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779
## 412 OKI 1002 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323
## 413 OKI 1003 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697
## 414 OKI 1004 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342
## 415 OKI 1005 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145
## 416 OKI 1006 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904
##              v6          v7          v8          v9         v10         v11
## 411 5.05374e-02 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02
## 412 1.02586e-03 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.98920e-05 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02
## 415 5.71364e-03 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##             v12         v13         v14         v15         v16         v17
## 411 2.66775e-02 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02
## 412 3.31449e-02 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05
## 414 9.98920e-05 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03
## 415 9.98830e-05 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##             v18         v19         v20         v21         v22 Latitude
## 411 9.00488e-03 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05  26.5013
## 412 9.99550e-05 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05  26.5013
## 413 4.38346e-03 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05  26.5013
## 414 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05  26.5013
## 415 9.98830e-05 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05  26.5013
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05  26.5013
##     Longitude Pop_City Country    Region Subregion  Year order
## 411  127.9454  Okinawa   Japan East Asia           2018?    54
## 412  127.9454  Okinawa   Japan East Asia           2018?    54
## 413  127.9454  Okinawa   Japan East Asia           2018?    54
## 414  127.9454  Okinawa   Japan East Asia           2018?    54
## 415  127.9454  Okinawa   Japan East Asia           2018?    54
## 416  127.9454  Okinawa   Japan East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

Load libraries for pie maps

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)

Create color palette

color_palette_22 <-
  c(
    "#FF8C1A",
    "#1E90FF",
    "#008080",
    "#B22222",
    "orangered2",
    "green",
    "#FFFF99",
    "navy", 
    "coral",
    "#B20CC9",
    "goldenrod3",
    "#77DD77",
    "#AE9393",
    "#F49AC2",
    "magenta",
    "green4",
    "#75FAFF",
    "blue",    
    "yellow2",
    "purple4",
    "purple",
    "chocolate4"
        )

5.1 Make pie plot showing all continents

sf_use_s2(FALSE)

## Spherical geometry (s2) switched off

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20", "v21", "v22"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_22) +
  guides(fill = "none") +  # Hide legend
#  coord_sf() +
  coord_sf(xlim = c(-90, 145), ylim = c(-30, 60)) +
  my_theme()

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "MAF_1", "maps", "lea_MAF1_k22_euro_global_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.2 Make pie plots just for Europe

df1 <- df1 |>
filter(
    Region == "Eastern Europe" | Region == "Southern Europe" | Region == "Western Europe"
  )

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20", "v21", "v22"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_22) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-13, 50), ylim = c(31, 53)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "MAF_1", "maps", "lea_MAF1_k22_euro_global_EUROPE_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf)

5.3 Make pie plot showing just Asia

leak22$index <- seq_len(nrow(leak22))

# Perform the merge as before
df1 <-
  merge(
    leak22,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind          v1          v2          v3          v4       v5
## 411 OKI 1001 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779
## 412 OKI 1002 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323
## 413 OKI 1003 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697
## 414 OKI 1004 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342
## 415 OKI 1005 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145
## 416 OKI 1006 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904
##              v6          v7          v8          v9         v10         v11
## 411 5.05374e-02 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02
## 412 1.02586e-03 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.98920e-05 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02
## 415 5.71364e-03 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##             v12         v13         v14         v15         v16         v17
## 411 2.66775e-02 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02
## 412 3.31449e-02 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05
## 414 9.98920e-05 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03
## 415 9.98830e-05 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##             v18         v19         v20         v21         v22 Latitude
## 411 9.00488e-03 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05  26.5013
## 412 9.99550e-05 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05  26.5013
## 413 4.38346e-03 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05  26.5013
## 414 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05  26.5013
## 415 9.98830e-05 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05  26.5013
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05  26.5013
##     Longitude Pop_City Country    Region Subregion  Year order
## 411  127.9454  Okinawa   Japan East Asia           2018?    54
## 412  127.9454  Okinawa   Japan East Asia           2018?    54
## 413  127.9454  Okinawa   Japan East Asia           2018?    54
## 414  127.9454  Okinawa   Japan East Asia           2018?    54
## 415  127.9454  Okinawa   Japan East Asia           2018?    54
## 416  127.9454  Okinawa   Japan East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20", "v21", "v22"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_22) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(60, 150), ylim = c(-10, 55)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "MAF_1", "maps", "lea_MAF1_k22_euro_global_ASIA_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Filter out America from the matrix

leak22$index <- seq_len(nrow(leak22))

# Perform the merge as before
df1 <-
  merge(
    leak22,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind          v1          v2          v3          v4       v5
## 411 OKI 1001 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779
## 412 OKI 1002 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323
## 413 OKI 1003 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697
## 414 OKI 1004 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342
## 415 OKI 1005 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145
## 416 OKI 1006 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904
##              v6          v7          v8          v9         v10         v11
## 411 5.05374e-02 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02
## 412 1.02586e-03 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.98920e-05 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02
## 415 5.71364e-03 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##             v12         v13         v14         v15         v16         v17
## 411 2.66775e-02 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02
## 412 3.31449e-02 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05
## 414 9.98920e-05 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03
## 415 9.98830e-05 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
##             v18         v19         v20         v21         v22 Latitude
## 411 9.00488e-03 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05  26.5013
## 412 9.99550e-05 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05  26.5013
## 413 4.38346e-03 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05  26.5013
## 414 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05  26.5013
## 415 9.98830e-05 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05  26.5013
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05  26.5013
##     Longitude Pop_City Country    Region Subregion  Year order
## 411  127.9454  Okinawa   Japan East Asia           2018?    54
## 412  127.9454  Okinawa   Japan East Asia           2018?    54
## 413  127.9454  Okinawa   Japan East Asia           2018?    54
## 414  127.9454  Okinawa   Japan East Asia           2018?    54
## 415  127.9454  Okinawa   Japan East Asia           2018?    54
## 416  127.9454  Okinawa   Japan East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

df1 <- df1 |>
filter(
    Region == "North America" | Region == "South America"
  )

5.3.1 Make pie plot showing just Americas

world <- ne_countries(scale = "medium", returnclass = "sf")

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns", 
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 2.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19","v20", "v21", "v22"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_22) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-120, -22), ylim = c(-30, 55)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "MAF_1", "maps", "lea_MAF1_k22_euro_global_Americas_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6. K=7 for global SNP Set 3

genotype <- here(
   "euro_global/output/snps_sets/r2_0.01_b.vcf"
  )

d <- read.vcfR(
  genotype
)

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 8
##   header_line: 9
##   variant count: 22642
##   column count: 697
## 
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 22642
##   Character matrix gt cols: 697
##   skip: 0
##   nrows: 22642
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant: 22642
## All variants processed

Get population and individuals information

inds_full <- attr(d@gt,"dimnames")[[2]]
inds_full <- inds_full[-1]
a <- strsplit(inds_full, '_')
pops <- unname(sapply(a, FUN = function(x) return(as.character(x[1])))) 
table(pops)

## pops
## ALD ALU ALV ARM BAR BEN BER BRE BUL CAM CES CHA CRO DES FRS GEL GES GRA GRC GRV 
##  10  12  12  10  12  12  12  13  10  12  14  12  12  16  12   2  12  11  10  12 
## HAI HAN HOC HUN IMP INJ INW ITB ITP ITR JAF KAC KAG KAN KAT KER KLP KRA KUN LAM 
##  12   4   7  12   4  11   4   5   9  12   2   6  12  11   6  12   4  12   4   9 
## MAL MAT OKI PAL POL POP QNC RAR REC ROM ROS SER SEV SIC SLO SOC SON SPB SPC SPM 
##  12  12  12  11   2  12  11  12  11   4  11   4  12   9  12  12   3   8   6   5 
## SPS SSK STS SUF SUU TAI TIK TIR TRE TUA TUH UTS YUN 
##   8  12  12   6   6   7  12   4  12   9  12  12   9

pops <- factor(pops)
inds <- unname(sapply(a, FUN = function(x) return(as.character(x[2]))))

project = load.snmfProject("euro_global/output/snps_sets/r2_0.01_b.snmfProject")

summary(project)

## $repetitions
##                       K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7 K = 8 K = 9
## with cross-entropy        5     5     5     5     5     5     5     5     5
## without cross-entropy     0     0     0     0     0     0     0     0     0
## total                     5     5     5     5     5     5     5     5     5
##                       K = 10 K = 11 K = 12 K = 13 K = 14 K = 15 K = 16 K = 17
## with cross-entropy         5      5      5      5      5      5      5      5
## without cross-entropy      0      0      0      0      0      0      0      0
## total                      5      5      5      5      5      5      5      5
##                       K = 18 K = 19 K = 20 K = 21 K = 22 K = 23 K = 24 K = 25
## with cross-entropy         5      5      5      5      5      5      5      5
## without cross-entropy      0      0      0      0      0      0      0      0
## total                      5      5      5      5      5      5      5      5
## 
## $crossEntropy
##          K = 1     K = 2     K = 3     K = 4     K = 5     K = 6     K = 7
## min  0.9295938 0.9035762 0.8929160 0.8866176 0.8831455 0.8810623 0.8793035
## mean 0.9299463 0.9038992 0.8932721 0.8869377 0.8840167 0.8813716 0.8797496
## max  0.9303090 0.9042813 0.8935996 0.8872815 0.8859704 0.8818163 0.8802178
##          K = 8     K = 9    K = 10    K = 11    K = 12    K = 13    K = 14
## min  0.8777762 0.8754397 0.8737881 0.8724198 0.8709529 0.8703098 0.8702140
## mean 0.8779980 0.8760114 0.8743412 0.8732523 0.8716021 0.8712855 0.8703966
## max  0.8784509 0.8765061 0.8748110 0.8741085 0.8724610 0.8718638 0.8706670
##         K = 15    K = 16    K = 17    K = 18    K = 19    K = 20    K = 21
## min  0.8691706 0.8691407 0.8684490 0.8678589 0.8677646 0.8675384 0.8673978
## mean 0.8697189 0.8695057 0.8689370 0.8686954 0.8680425 0.8681742 0.8678156
## max  0.8701692 0.8700150 0.8695248 0.8697050 0.8682069 0.8685108 0.8686613
##         K = 22    K = 23    K = 24    K = 25
## min  0.8669903 0.8671792 0.8666401 0.8673425
## mean 0.8679212 0.8675363 0.8676546 0.8677506
## max  0.8688708 0.8679872 0.8685890 0.8681663

# get the cross-entropy of all runs for K = 7
ce = cross.entropy(project, K = 7)
ce #run 5 is best for k=7

##           K = 7
## run 1 0.8793121
## run 2 0.8800735
## run 3 0.8802178
## run 4 0.8798409
## run 5 0.8793035

change to correct matrix

leak7 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.01_b.snmf/K7/run5/r2_0.01_b_r5.7.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(leak7)

## # A tibble: 6 × 7
##         X1       X2     X3      X4     X5     X6    X7
##      <dbl>    <dbl>  <dbl>   <dbl>  <dbl>  <dbl> <dbl>
## 1 0.0756   0.0293   0.0751 0.0622  0.0574 0.0800 0.620
## 2 0.0512   0.0620   0.0632 0.0807  0.0627 0.125  0.556
## 3 0.000100 0.000100 0.0497 0.0150  0.0934 0.0266 0.815
## 4 0.0159   0.00257  0.0664 0.0291  0.0366 0.0927 0.757
## 5 0.000100 0.000100 0.0483 0.0153  0.0423 0.0944 0.800
## 6 0.000100 0.000100 0.0239 0.00847 0.0909 0.0414 0.835

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Change ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

leak7 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak7)

head(leak7)

##    ind pop          X1          X2        X3         X4        X5        X6
## 1 1001 OKI 0.075587600 0.029332800 0.0750965 0.06223380 0.0574401 0.0799922
## 2 1002 OKI 0.051196200 0.062006400 0.0631862 0.08066190 0.0626840 0.1246690
## 3 1003 OKI 0.000099982 0.000099982 0.0496697 0.01503730 0.0933872 0.0266181
## 4 1004 OKI 0.015876700 0.002571750 0.0663977 0.02908000 0.0366296 0.0927472
## 5 1005 OKI 0.000099982 0.000099982 0.0482861 0.01532970 0.0423157 0.0943542
## 6 1006 OKI 0.000099982 0.000099982 0.0239108 0.00847465 0.0909052 0.0413913
##         X7
## 1 0.620317
## 2 0.555597
## 3 0.815088
## 4 0.756697
## 5 0.799514
## 6 0.835118

Rename the columns

# Rename the columns starting from the third one
leak7 <- leak7 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak7)

##    ind pop          v1          v2        v3         v4        v5        v6
## 1 1001 OKI 0.075587600 0.029332800 0.0750965 0.06223380 0.0574401 0.0799922
## 2 1002 OKI 0.051196200 0.062006400 0.0631862 0.08066190 0.0626840 0.1246690
## 3 1003 OKI 0.000099982 0.000099982 0.0496697 0.01503730 0.0933872 0.0266181
## 4 1004 OKI 0.015876700 0.002571750 0.0663977 0.02908000 0.0366296 0.0927472
## 5 1005 OKI 0.000099982 0.000099982 0.0482861 0.01532970 0.0423157 0.0943542
## 6 1006 OKI 0.000099982 0.000099982 0.0239108 0.00847465 0.0909052 0.0413913
##         v7
## 1 0.620317
## 2 0.555597
## 3 0.815088
## 4 0.756697
## 5 0.799514
## 6 0.835118

Import samples attributes for euro_global

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
   dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

pops <- pops |> filter(!is.na(order))

head(pops)

##   Abbreviation Latitude Longitude     Pop_City Country        Region Subregion
## 1          BER 39.79081  -74.9291   Berlin, NJ     USA North America          
## 2          COL 39.97170  -82.9071 Columbus, OH     USA North America          
## 3          PAL 26.70560  -80.0364   Palm Beach     USA North America          
## 4          HOU 29.75491  -95.3505  Houston, TX     USA North America          
## 5          LOS 34.05220 -118.2437  Los Angeles     USA North America          
## 6          MAU -3.09161  -60.0325   Manaus, AM  Brazil South America          
##   Year order
## 1 2018     1
## 2 2015     2
## 3 2018     3
## 4 2018     4
## 5 2018     5
## 6 2017     6

Merge with pops

# Add an index column to Q_tibble
leak7$index <- seq_len(nrow(leak7))

# Perform the merge as before
df1 <-
  merge(
    leak7,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind          v1          v2        v3         v4        v5        v6
## 411 OKI 1001 0.075587600 0.029332800 0.0750965 0.06223380 0.0574401 0.0799922
## 412 OKI 1002 0.051196200 0.062006400 0.0631862 0.08066190 0.0626840 0.1246690
## 413 OKI 1003 0.000099982 0.000099982 0.0496697 0.01503730 0.0933872 0.0266181
## 414 OKI 1004 0.015876700 0.002571750 0.0663977 0.02908000 0.0366296 0.0927472
## 415 OKI 1005 0.000099982 0.000099982 0.0482861 0.01532970 0.0423157 0.0943542
## 416 OKI 1006 0.000099982 0.000099982 0.0239108 0.00847465 0.0909052 0.0413913
##           v7 Latitude Longitude Pop_City Country    Region Subregion  Year
## 411 0.620317  26.5013  127.9454  Okinawa   Japan East Asia           2018?
## 412 0.555597  26.5013  127.9454  Okinawa   Japan East Asia           2018?
## 413 0.815088  26.5013  127.9454  Okinawa   Japan East Asia           2018?
## 414 0.756697  26.5013  127.9454  Okinawa   Japan East Asia           2018?
## 415 0.799514  26.5013  127.9454  Okinawa   Japan East Asia           2018?
## 416 0.835118  26.5013  127.9454  Okinawa   Japan East Asia           2018?
##     order
## 411    54
## 412    54
## 413    54
## 414    54
## 415    54
## 416    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

make a new palette with only 7 colors

color_palette_7 <-
  c(
    "#77DD37",     
    "#FFFF19",
    "#75FAFF",    
    "purple3",    
    "red",  
    "#1E90FF", 
    "#FF8C1A")

6.1 Pie plots for LEA, k=7 for SNP Set 3 (r2<0.01,MAF 1%)

6.1.1 Europe only

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "scripts", "RMarkdowns", "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_7) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k7_euro_global_pie_EUROPE.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6.1.2 Pie plots showing all continents

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)


world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_7) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-20, 150), ylim = c(-30, 60)) +
  my_theme()

# #

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k7_euro_global_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6.1.3 Pie plots showing just Asia

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_7) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(60, 150), ylim = c(-10, 55)) +
  my_theme()

# #

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k7_euro_global_pie_ASIA.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6.2 K=6 for global SNP Set 3

library(LEA)
library(vcfR)
genotype <- here(
   "euro_global/output/snps_sets/r2_0.01_b.vcf"
  )

d <- read.vcfR(
  genotype
)

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 8
##   header_line: 9
##   variant count: 22642
##   column count: 697
## 
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 22642
##   Character matrix gt cols: 697
##   skip: 0
##   nrows: 22642
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant: 22642
## All variants processed

Get population and individuals information

inds_full <- attr(d@gt,"dimnames")[[2]]
inds_full <- inds_full[-1]
a <- strsplit(inds_full, '_')
pops <- unname(sapply(a, FUN = function(x) return(as.character(x[1])))) 
table(pops)

## pops
## ALD ALU ALV ARM BAR BEN BER BRE BUL CAM CES CHA CRO DES FRS GEL GES GRA GRC GRV 
##  10  12  12  10  12  12  12  13  10  12  14  12  12  16  12   2  12  11  10  12 
## HAI HAN HOC HUN IMP INJ INW ITB ITP ITR JAF KAC KAG KAN KAT KER KLP KRA KUN LAM 
##  12   4   7  12   4  11   4   5   9  12   2   6  12  11   6  12   4  12   4   9 
## MAL MAT OKI PAL POL POP QNC RAR REC ROM ROS SER SEV SIC SLO SOC SON SPB SPC SPM 
##  12  12  12  11   2  12  11  12  11   4  11   4  12   9  12  12   3   8   6   5 
## SPS SSK STS SUF SUU TAI TIK TIR TRE TUA TUH UTS YUN 
##   8  12  12   6   6   7  12   4  12   9  12  12   9

pops <- factor(pops)
inds <- unname(sapply(a, FUN = function(x) return(as.character(x[2]))))

project = load.snmfProject("euro_global/output/snps_sets/r2_0.01_b.snmfProject")

summary(project)

## $repetitions
##                       K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7 K = 8 K = 9
## with cross-entropy        5     5     5     5     5     5     5     5     5
## without cross-entropy     0     0     0     0     0     0     0     0     0
## total                     5     5     5     5     5     5     5     5     5
##                       K = 10 K = 11 K = 12 K = 13 K = 14 K = 15 K = 16 K = 17
## with cross-entropy         5      5      5      5      5      5      5      5
## without cross-entropy      0      0      0      0      0      0      0      0
## total                      5      5      5      5      5      5      5      5
##                       K = 18 K = 19 K = 20 K = 21 K = 22 K = 23 K = 24 K = 25
## with cross-entropy         5      5      5      5      5      5      5      5
## without cross-entropy      0      0      0      0      0      0      0      0
## total                      5      5      5      5      5      5      5      5
## 
## $crossEntropy
##          K = 1     K = 2     K = 3     K = 4     K = 5     K = 6     K = 7
## min  0.9295938 0.9035762 0.8929160 0.8866176 0.8831455 0.8810623 0.8793035
## mean 0.9299463 0.9038992 0.8932721 0.8869377 0.8840167 0.8813716 0.8797496
## max  0.9303090 0.9042813 0.8935996 0.8872815 0.8859704 0.8818163 0.8802178
##          K = 8     K = 9    K = 10    K = 11    K = 12    K = 13    K = 14
## min  0.8777762 0.8754397 0.8737881 0.8724198 0.8709529 0.8703098 0.8702140
## mean 0.8779980 0.8760114 0.8743412 0.8732523 0.8716021 0.8712855 0.8703966
## max  0.8784509 0.8765061 0.8748110 0.8741085 0.8724610 0.8718638 0.8706670
##         K = 15    K = 16    K = 17    K = 18    K = 19    K = 20    K = 21
## min  0.8691706 0.8691407 0.8684490 0.8678589 0.8677646 0.8675384 0.8673978
## mean 0.8697189 0.8695057 0.8689370 0.8686954 0.8680425 0.8681742 0.8678156
## max  0.8701692 0.8700150 0.8695248 0.8697050 0.8682069 0.8685108 0.8686613
##         K = 22    K = 23    K = 24    K = 25
## min  0.8669903 0.8671792 0.8666401 0.8673425
## mean 0.8679212 0.8675363 0.8676546 0.8677506
## max  0.8688708 0.8679872 0.8685890 0.8681663

# get the cross-entropy of all runs for K = 6
ce = cross.entropy(project, K = 6)
ce #run 5 is best for k=6

##           K = 6
## run 1 0.8811239
## run 2 0.8814691
## run 3 0.8818163
## run 4 0.8813863
## run 5 0.8810623

change to correct matrix

leak6 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.01_b.snmf/K6/run5/r2_0.01_b_r5.6.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(leak6)

## # A tibble: 6 × 6
##      X1    X2     X3    X4     X5       X6
##   <dbl> <dbl>  <dbl> <dbl>  <dbl>    <dbl>
## 1 0.174 0.172 0.150  0.350 0.131  0.0241  
## 2 0.205 0.197 0.170  0.293 0.121  0.0131  
## 3 0.160 0.156 0.123  0.400 0.117  0.0444  
## 4 0.199 0.204 0.119  0.389 0.0885 0.000100
## 5 0.180 0.218 0.113  0.404 0.0847 0.000100
## 6 0.203 0.187 0.0976 0.404 0.108  0.000100

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Change ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to matrix

leak6 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak6)

head(leak6)

##    ind pop       X1       X2        X3       X4        X5          X6
## 1 1001 OKI 0.173708 0.171848 0.1501010 0.349694 0.1305770 0.024071800
## 2 1002 OKI 0.205148 0.197109 0.1701940 0.292993 0.1214940 0.013061600
## 3 1003 OKI 0.159720 0.156319 0.1226130 0.399926 0.1169840 0.044438900
## 4 1004 OKI 0.198631 0.204137 0.1192340 0.389391 0.0885066 0.000099991
## 5 1005 OKI 0.179729 0.218195 0.1134380 0.403849 0.0846894 0.000099991
## 6 1006 OKI 0.202770 0.187071 0.0975617 0.404205 0.1082920 0.000099991

Rename the columns

# Rename the columns starting from the third one
leak6 <- leak6 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak6)

##    ind pop       v1       v2        v3       v4        v5          v6
## 1 1001 OKI 0.173708 0.171848 0.1501010 0.349694 0.1305770 0.024071800
## 2 1002 OKI 0.205148 0.197109 0.1701940 0.292993 0.1214940 0.013061600
## 3 1003 OKI 0.159720 0.156319 0.1226130 0.399926 0.1169840 0.044438900
## 4 1004 OKI 0.198631 0.204137 0.1192340 0.389391 0.0885066 0.000099991
## 5 1005 OKI 0.179729 0.218195 0.1134380 0.403849 0.0846894 0.000099991
## 6 1006 OKI 0.202770 0.187071 0.0975617 0.404205 0.1082920 0.000099991

Import samples attributes for euro_global

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
   dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

pops <- pops |> filter(!is.na(order))

head(pops)

##   Abbreviation Latitude Longitude     Pop_City Country        Region Subregion
## 1          BER 39.79081  -74.9291   Berlin, NJ     USA North America          
## 2          COL 39.97170  -82.9071 Columbus, OH     USA North America          
## 3          PAL 26.70560  -80.0364   Palm Beach     USA North America          
## 4          HOU 29.75491  -95.3505  Houston, TX     USA North America          
## 5          LOS 34.05220 -118.2437  Los Angeles     USA North America          
## 6          MAU -3.09161  -60.0325   Manaus, AM  Brazil South America          
##   Year order
## 1 2018     1
## 2 2015     2
## 3 2018     3
## 4 2018     4
## 5 2018     5
## 6 2017     6

Merge with pops

# Add an index column to Q_tibble
leak6$index <- seq_len(nrow(leak6))

# Perform the merge as before
df1 <-
  merge(
    leak6,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1       v2        v3       v4        v5          v6
## 411 OKI 1001 0.173708 0.171848 0.1501010 0.349694 0.1305770 0.024071800
## 412 OKI 1002 0.205148 0.197109 0.1701940 0.292993 0.1214940 0.013061600
## 413 OKI 1003 0.159720 0.156319 0.1226130 0.399926 0.1169840 0.044438900
## 414 OKI 1004 0.198631 0.204137 0.1192340 0.389391 0.0885066 0.000099991
## 415 OKI 1005 0.179729 0.218195 0.1134380 0.403849 0.0846894 0.000099991
## 416 OKI 1006 0.202770 0.187071 0.0975617 0.404205 0.1082920 0.000099991
##     Latitude Longitude Pop_City Country    Region Subregion  Year order
## 411  26.5013  127.9454  Okinawa   Japan East Asia           2018?    54
## 412  26.5013  127.9454  Okinawa   Japan East Asia           2018?    54
## 413  26.5013  127.9454  Okinawa   Japan East Asia           2018?    54
## 414  26.5013  127.9454  Okinawa   Japan East Asia           2018?    54
## 415  26.5013  127.9454  Okinawa   Japan East Asia           2018?    54
## 416  26.5013  127.9454  Okinawa   Japan East Asia           2018?    54

df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'

make a new palette with only 6 colors

color_palette_6 <-
  c(
    "#FFFF19",   
    "#1E90FF", 
    "#77DD37",    
    "#FF8C1A",
    "purple3",      
    "red")

6.2.1 Pie plots for LEA, K=6 for SNP Set 3(r2<0.01, MAF 1%)

6.2.1.1 Europe only

Make pie plot

library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "scripts", "RMarkdowns", "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_6) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k6_euro_global_pie_EUROPE.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6.2.1.2 Pie plots showing all continents

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)


world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_6) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-20, 150), ylim = c(-30, 60)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k6_euro_global_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6.2.1.3 Pie plots showing just Asia

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)


world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_6) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(60, 150), ylim = c(-10, 46)) +
  my_theme()

# #

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k6_euro_global_pie_ASIA.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6.2.1.4 Pie plots showing all continents including Americas

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_6) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-100, 150), ylim = c(-40, 60)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k6_euro_global_pie_whole.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6.2.1.5 Pie plots showing Americas

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("scripts", "RMarkdowns",
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = world, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 2.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.1, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette_6) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-100, -25), ylim = c(-40, 60)) +
  my_theme()

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "lea", "maps", "lea_r1_k6_euro_global_pie_americas.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Euro_global_maps_interpolation

Margaret Corley

2023-12-04

Plots of structure output for euro_global dataset

1. Load the standard packages

2. K=20 LEA maps for SNP Set 2 (r2<0.1) for Europe global dataset

2.1 Pie plots for LEA using SNP Set 2 (r2<0.1)

2.1.1 Make pie plot showing all continents

2.1.2 Make pie plots just for Europe

2.1.2 Make pie plot showing just Asia

2.1.3 Make pie plot showing just Americas

3. K=5 LEA maps for SNP Set 2 (r1<0.1) for Europe global dataset

3.1 Pie plots for LEA, k=5, SNP Set 2 (r2<0.1)

3.1.1 Pie plots showing all continents

3.1 2 Pie plots for Europe

3.1.3 Pie plots for just Asia

3.1.4 Pie plots for just Americas

4. K=19 fastStructure simple maps for SNP Set 2 (r2<0.1) for Europe global dataset

4.1 Import samples attributes

4.1.1 Import the Q matrix (K19 for fastStructure)

4.2 Q-values

SNP Set 3 (MAF 1%) plots

5. LEA SNP Set 3 (MAF 1%) for euro_global dataset

5.1 Make pie plot showing all continents

5.2 Make pie plots just for Europe

5.3 Make pie plot showing just Asia

5.3.1 Make pie plot showing just Americas

6. K=7 for global SNP Set 3

6.1 Pie plots for LEA, k=7 for SNP Set 3 (r2<0.01,MAF 1%)

6.1.1 Europe only

6.1.2 Pie plots showing all continents

6.1.3 Pie plots showing just Asia

6.2 K=6 for global SNP Set 3

6.2.1 Pie plots for LEA, K=6 for SNP Set 3(r2<0.01, MAF 1%)

6.2.1.1 Europe only

6.2.1.2 Pie plots showing all continents

6.2.1.3 Pie plots showing just Asia

6.2.1.4 Pie plots showing all continents including Americas

6.2.1.5 Pie plots showing Americas