Load libraries
1. Overlap - STUCTURE results for 24 pops overlapping with SNPs (N=637)
2. Overlap - STUCTURE results for 24 pops overlapping with SNPs (N=637) for LOCPRIOR run
- 2.1 Best K = 3
  - 2.1.1 Extract ancestry coefficients for k=3
- 2.2 Map for K=3
  - 2.2.1 Pie plots

Load libraries

library(tidyverse)          # many helpful things
library(here)               # to load data easily
library(dplyr)              # to manipulate data
library(ggplot2)            # plots
library(extrafont)          # probably won't work on openOnDemand on the clusters
library(hrbrthemes)         # not really needed, personal preference for plots
library(officer)            # export office format
library(here)

1. Overlap - STUCTURE results for 24 pops overlapping with SNPs (N=637)

1.1 Best K = 3

Using ggplot2 for individual admixtures #### 1.1.1 Extract ancestry coefficients for k=3

leak3 <- read_delim(
  here("output/microsats/STRUCTURE/overlap_nopops_run_13_Qmatrix.csv"),
  col_names = FALSE,
  show_col_types = FALSE)

head(leak3)

## # A tibble: 6 × 3
##      X1    X2    X3
##   <dbl> <dbl> <dbl>
## 1 0.985 0.008 0.007
## 2 0.844 0.017 0.138
## 3 0.02  0.014 0.965
## 4 0.568 0.163 0.269
## 5 0.961 0.01  0.03 
## 6 0.973 0.01  0.017

The population information

fam_file <- here("output/microsats/STRUCTURE/fam_file_data.txt")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID
## 1      FRS       FRMH01
## 2      FRS       FRMH02
## 3      FRS       FRMH03
## 4      FRS       FRMH04
## 5      FRS       FRMH05
## 6      FRS       FRMH06

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##      ind pop
## 1 FRMH01 FRS
## 2 FRMH02 FRS
## 3 FRMH03 FRS
## 4 FRMH04 FRS
## 5 FRMH05 FRS
## 6 FRMH06 FRS

Add it to the matrix

leak3 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak3)

head(leak3)

##      ind pop    X1    X2    X3
## 1 FRMH01 FRS 0.985 0.008 0.007
## 2 FRMH02 FRS 0.844 0.017 0.138
## 3 FRMH03 FRS 0.020 0.014 0.965
## 4 FRMH04 FRS 0.568 0.163 0.269
## 5 FRMH05 FRS 0.961 0.010 0.030
## 6 FRMH06 FRS 0.973 0.010 0.017

Rename the columns

# Rename the columns starting from the third one
leak3 <- leak3 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak3)

##      ind pop    v1    v2    v3
## 1 FRMH01 FRS 0.985 0.008 0.007
## 2 FRMH02 FRS 0.844 0.017 0.138
## 3 FRMH03 FRS 0.020 0.014 0.965
## 4 FRMH04 FRS 0.568 0.163 0.269
## 5 FRMH05 FRS 0.961 0.010 0.030
## 6 FRMH06 FRS 0.973 0.010 0.017

Import Sample Locations

sampling_loc <- readRDS(here("output/microsats/STRUCTURE/sampling_loc_overlap.rds"))
head(sampling_loc)

## # A tibble: 6 × 10
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 1 more variable: order <dbl>

source(
  here("my_theme3.R")
)

# Melt the data frame for plotting
Q_melted <- leak3 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
  c(
    "#77DD77",
    "yellow2",
    "#FF8C1A"
  )

# Generate all potential variable names
all_variables <- paste0("v", 1:3)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=3.\n STRUCTURE inference with 11 microsats for 24 locations.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("STRUCTURE/overlap/results/STRUCTURE_plot_k=3_overlap_microsats_run13.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

1.2 K = 10 (for comparison to SNP’s best K)

Using ggplot2 for individual admixtures #### 1.2.1 Extract ancestry coefficients for k=10

leak10 <- read_delim(
  here("output/microsats/STRUCTURE/overlap_nopops_run_82_Qmatrix.csv"),
  col_names = FALSE,
  show_col_types = FALSE)

head(leak10)

## # A tibble: 6 × 10
##      X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.932 0.011 0.004 0.005 0.005 0.016 0.012 0.004 0.005 0.005
## 2 0.199 0.012 0.004 0.007 0.04  0.618 0.049 0.006 0.007 0.058
## 3 0.046 0.011 0.012 0.031 0.071 0.046 0.017 0.017 0.009 0.741
## 4 0.034 0.007 0.007 0.007 0.897 0.013 0.005 0.011 0.014 0.006
## 5 0.538 0.031 0.008 0.011 0.043 0.004 0.013 0.005 0.003 0.344
## 6 0.894 0.008 0.009 0.026 0.012 0.021 0.005 0.009 0.01  0.005

The population information

fam_file <- here("output/microsats/STRUCTURE/fam_file_data.txt")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID
## 1      FRS       FRMH01
## 2      FRS       FRMH02
## 3      FRS       FRMH03
## 4      FRS       FRMH04
## 5      FRS       FRMH05
## 6      FRS       FRMH06

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##      ind pop
## 1 FRMH01 FRS
## 2 FRMH02 FRS
## 3 FRMH03 FRS
## 4 FRMH04 FRS
## 5 FRMH05 FRS
## 6 FRMH06 FRS

Add it to the matrix

leak10 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak10)

head(leak10)

##      ind pop    X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1 FRMH01 FRS 0.932 0.011 0.004 0.005 0.005 0.016 0.012 0.004 0.005 0.005
## 2 FRMH02 FRS 0.199 0.012 0.004 0.007 0.040 0.618 0.049 0.006 0.007 0.058
## 3 FRMH03 FRS 0.046 0.011 0.012 0.031 0.071 0.046 0.017 0.017 0.009 0.741
## 4 FRMH04 FRS 0.034 0.007 0.007 0.007 0.897 0.013 0.005 0.011 0.014 0.006
## 5 FRMH05 FRS 0.538 0.031 0.008 0.011 0.043 0.004 0.013 0.005 0.003 0.344
## 6 FRMH06 FRS 0.894 0.008 0.009 0.026 0.012 0.021 0.005 0.009 0.010 0.005

Rename the columns

# Rename the columns starting from the third one
leak10 <- leak10 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak10)

##      ind pop    v1    v2    v3    v4    v5    v6    v7    v8    v9   v10
## 1 FRMH01 FRS 0.932 0.011 0.004 0.005 0.005 0.016 0.012 0.004 0.005 0.005
## 2 FRMH02 FRS 0.199 0.012 0.004 0.007 0.040 0.618 0.049 0.006 0.007 0.058
## 3 FRMH03 FRS 0.046 0.011 0.012 0.031 0.071 0.046 0.017 0.017 0.009 0.741
## 4 FRMH04 FRS 0.034 0.007 0.007 0.007 0.897 0.013 0.005 0.011 0.014 0.006
## 5 FRMH05 FRS 0.538 0.031 0.008 0.011 0.043 0.004 0.013 0.005 0.003 0.344
## 6 FRMH06 FRS 0.894 0.008 0.009 0.026 0.012 0.021 0.005 0.009 0.010 0.005

Import Sample Locations

sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/microsats/STRUCTURE/sampling_loc_euro_microsats.rds"))
head(sampling_loc)

##        Pop_City    Country   Latitude Longitude Continent Abbreviation Year
## 1         Limbe   Cameroon   4.023252  9.194771    Africa          LIB 2018
## 2     Morondava Madagascar -20.284200 44.279400    Africa          MAD 2016
## 3 Trois-Bassins    Reunion -21.109008 55.319206    Africa          TRO 2017
## 4       Dauguet  Mauritius -20.185300 57.521540    Africa          DAU 2022
## 5   Franceville      Gabon  -1.592070 13.532420    Africa          GAB 2015
## 6  Antananarivo Madagascar -18.879200 47.507900    Africa          ANT 2022
##           Region    Subregion order order2 orderold order_microsat microsats
## 1    West Africa       Africa    79     73       71             NA          
## 2    East Africa  East Africa    80     78       72             NA          
## 3   Indian Ocean Indian Ocean    81     81       73             NA          
## 4   Indian Ocean Indian Ocean    82     80       74             NA          
## 5 Central Africa       Africa    NA     72       NA             NA          
## 6    East Africa  East Africa    NA     76       NA             NA          
##   microsat_code alt_code  X  Country.1
## 1                        NA   Cameroon
## 2                        NA Madagascar
## 3                        NA    Reunion
## 4                        NA  Mauritius
## 5                        NA      Gabon
## 6                        NA Madagascar

source(
  here("my_theme3.R")
)

# Melt the data frame for plotting
Q_melted <- leak10 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
#  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order_microsat, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
 c(
   "darkgreen",   
   "#1E90FF", 
   "cyan",
   "yellow2",
   "#F49AC2",
   "#77DD77",
   "magenta",   
   "red3",      
   "#FFB347",
   "#FF8C1A")


# Generate all potential variable names
all_variables <- paste0("v", 1:10)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=10.\n STRUCTURE inference with 11 microsats for 24 locations.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("STRUCTURE/overlap/results/STRUCTURE_plot_k=10_overlap_microsats_run13.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

1.3 Map for K=3

color_palette <-
  c("#FF8C1A",
    "yellow2",
    "#77DD77")

1.1 Pie plots for all pops

1.1.1. Import samples attributes

#data<-read.csv("sampling_loc_euro_only_microsats.csv", stringsAsFactors = TRUE) 
#write_rds(data, "sampling_loc_euro_only_microsats.rds")

sampling_loc <- readRDS(here("output/microsats/STRUCTURE/sampling_loc_overlap.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Year, order)

head(pops)

## # A tibble: 6 × 8
##   Abbreviation Latitude Longitude Pop_City            Country Region  Year order
##   <chr>           <dbl>     <dbl> <chr>               <chr>   <chr>  <dbl> <dbl>
## 1 FRS              45.2     5.77  Saint-Martin-d'Her… France  Weste…  2019     9
## 2 STS              48.6     7.75  Strasbourg          France  Weste…  2019    10
## 3 POP              41.2    -8.33  Penafiel            Portug… South…  2017    11
## 4 SPB              38.9    -6.97  Badajoz             Spain   South…  2018    13
## 5 SPS              36.2    -5.37  San Roque           Spain   South…  2017    14
## 6 SPC              39.4    -0.396 Catarroja           Spain   South…  2017    15

Merge with pops

# Add an index column to Q_tibble
leak3$index <- seq_len(nrow(leak3))

# Perform the merge as before
df1 <-
  merge(
    leak3,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop    ind    v1    v2    v3 Latitude Longitude             Pop_City
## 117 FRS FRMH01 0.985 0.008 0.007 45.16531  5.771806 Saint-Martin-d'Heres
## 118 FRS FRMH02 0.844 0.017 0.138 45.16531  5.771806 Saint-Martin-d'Heres
## 119 FRS FRMH03 0.020 0.014 0.965 45.16531  5.771806 Saint-Martin-d'Heres
## 120 FRS FRMH04 0.568 0.163 0.269 45.16531  5.771806 Saint-Martin-d'Heres
## 121 FRS FRMH05 0.961 0.010 0.030 45.16531  5.771806 Saint-Martin-d'Heres
## 122 FRS FRMH06 0.973 0.010 0.017 45.16531  5.771806 Saint-Martin-d'Heres
##     Country         Region Year order
## 117  France Western Europe 2019     9
## 118  France Western Europe 2019     9
## 119  France Western Europe 2019     9
## 120  France Western Europe 2019     9
## 121  France Western Europe 2019     9
## 122  France Western Europe 2019     9

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)

## Linking to GEOS 3.9.1, GDAL 3.2.1, PROJ 7.2.1; sf_use_s2() is TRUE

library(rnaturalearth)
library(rnaturalearthdata)

## 
## Attaching package: 'rnaturalearthdata'

## The following object is masked from 'package:rnaturalearth':
## 
##     countries110

library(ggrepel)
library(Cairo)

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("my_theme2.R")
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave(
  here("STRUCTURE/overlap/results/Structure_k3_microsats_pie_overlap_allcountries.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("my_theme2.R")
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3"), 
                  color = NA) +
  scale_fill_manual(values = color_palette) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

ggsave(
  here("STRUCTURE/overlap/results/Structure_k3_microsats_pie_overlap_nolabs.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2. Overlap - STUCTURE results for 24 pops overlapping with SNPs (N=637) for LOCPRIOR run

2.1 Best K = 3

Using ggplot2 for individual admixtures

2.1.1 Extract ancestry coefficients for k=3

leak3 <- read_delim(
  here("output/microsats/STRUCTURE/overlap_microsats_usepoploc_run_13_Qmatrix.csv"),
  col_names = FALSE,
  show_col_types = FALSE)

head(leak3)

## # A tibble: 6 × 3
##      X1    X2    X3
##   <dbl> <dbl> <dbl>
## 1 0.994 0.001 0.005
## 2 0.896 0.003 0.101
## 3 0.235 0.003 0.763
## 4 0.598 0.031 0.37 
## 5 0.965 0.001 0.034
## 6 0.985 0.002 0.013

The population information

fam_file <- here("output/microsats/STRUCTURE/overlap_loc_prior/fam_file_data.txt")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID
## 1      FRS       FRMH01
## 2      FRS       FRMH02
## 3      FRS       FRMH03
## 4      FRS       FRMH04
## 5      FRS       FRMH05
## 6      FRS       FRMH06

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##      ind pop
## 1 FRMH01 FRS
## 2 FRMH02 FRS
## 3 FRMH03 FRS
## 4 FRMH04 FRS
## 5 FRMH05 FRS
## 6 FRMH06 FRS

Add it to the matrix

leak3 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak3)

head(leak3)

##      ind pop    X1    X2    X3
## 1 FRMH01 FRS 0.994 0.001 0.005
## 2 FRMH02 FRS 0.896 0.003 0.101
## 3 FRMH03 FRS 0.235 0.003 0.763
## 4 FRMH04 FRS 0.598 0.031 0.370
## 5 FRMH05 FRS 0.965 0.001 0.034
## 6 FRMH06 FRS 0.985 0.002 0.013

Rename the columns

# Rename the columns starting from the third one
leak3 <- leak3 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak3)

##      ind pop    v1    v2    v3
## 1 FRMH01 FRS 0.994 0.001 0.005
## 2 FRMH02 FRS 0.896 0.003 0.101
## 3 FRMH03 FRS 0.235 0.003 0.763
## 4 FRMH04 FRS 0.598 0.031 0.370
## 5 FRMH05 FRS 0.965 0.001 0.034
## 6 FRMH06 FRS 0.985 0.002 0.013

Import Sample Locations

sampling_loc <- readRDS(here("output/microsats/STRUCTURE/sampling_loc_overlap.rds"))
head(sampling_loc)

## # A tibble: 6 × 10
##   Pop_City Country Latitude Longitude Continent Abbreviation  Year Region Marker
##   <chr>    <chr>      <dbl>     <dbl> <chr>     <chr>        <dbl> <chr>  <chr> 
## 1 Saint-M… France      45.2     5.77  Europe    FRS           2019 Weste… SNPs  
## 2 Strasbo… France      48.6     7.75  Europe    STS           2019 Weste… SNPs  
## 3 Penafiel Portug…     41.2    -8.33  Europe    POP           2017 South… SNPs  
## 4 Badajoz  Spain       38.9    -6.97  Europe    SPB           2018 South… SNPs  
## 5 San Roq… Spain       36.2    -5.37  Europe    SPS           2017 South… SNPs  
## 6 Catarro… Spain       39.4    -0.396 Europe    SPC           2017 South… SNPs  
## # ℹ 1 more variable: order <dbl>

source(
  here("my_theme3.R")
)

# Melt the data frame for plotting
Q_melted <- leak3 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
  c(
    "yellow2",
    "#FF8C1A",
    "#77DD77"
  )

# Generate all potential variable names
all_variables <- paste0("v", 1:3)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=3.\n STRUCTURE inference using population data with 11 microsats for 24 locations.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("STRUCTURE/overlap_loc_prior/results/STRUCTURE_locprior_k=3_overlap_microsats_run13.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.2 Map for K=3

color_palette <-
  c(   "#FF8C1A",
       "yellow2",
     "#77DD77")

2.2.1 Pie plots

Import samples attributes

#data<-read.csv("sampling_loc_euro_only_microsats.csv", stringsAsFactors = TRUE) 
#write_rds(data, "sampling_loc_euro_only_microsats.rds")

sampling_loc <- readRDS(here("output/microsats/STRUCTURE/sampling_loc_overlap.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Year, order)

head(pops)

## # A tibble: 6 × 8
##   Abbreviation Latitude Longitude Pop_City            Country Region  Year order
##   <chr>           <dbl>     <dbl> <chr>               <chr>   <chr>  <dbl> <dbl>
## 1 FRS              45.2     5.77  Saint-Martin-d'Her… France  Weste…  2019     9
## 2 STS              48.6     7.75  Strasbourg          France  Weste…  2019    10
## 3 POP              41.2    -8.33  Penafiel            Portug… South…  2017    11
## 4 SPB              38.9    -6.97  Badajoz             Spain   South…  2018    13
## 5 SPS              36.2    -5.37  San Roque           Spain   South…  2017    14
## 6 SPC              39.4    -0.396 Catarroja           Spain   South…  2017    15

Merge with pops

# Add an index column to Q_tibble
leak3$index <- seq_len(nrow(leak3))

# Perform the merge as before
df1 <-
  merge(
    leak3,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop    ind    v1    v2    v3 Latitude Longitude             Pop_City
## 117 FRS FRMH01 0.994 0.001 0.005 45.16531  5.771806 Saint-Martin-d'Heres
## 118 FRS FRMH02 0.896 0.003 0.101 45.16531  5.771806 Saint-Martin-d'Heres
## 119 FRS FRMH03 0.235 0.003 0.763 45.16531  5.771806 Saint-Martin-d'Heres
## 120 FRS FRMH04 0.598 0.031 0.370 45.16531  5.771806 Saint-Martin-d'Heres
## 121 FRS FRMH05 0.965 0.001 0.034 45.16531  5.771806 Saint-Martin-d'Heres
## 122 FRS FRMH06 0.985 0.002 0.013 45.16531  5.771806 Saint-Martin-d'Heres
##     Country         Region Year order
## 117  France Western Europe 2019     9
## 118  France Western Europe 2019     9
## 119  France Western Europe 2019     9
## 120  France Western Europe 2019     9
## 121  France Western Europe 2019     9
## 122  France Western Europe 2019     9

# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here("my_theme2.R")
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = color_palette) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

ggsave(
  here("STRUCTURE/overlap_loc_prior/results/Structure_k3_microsats_pie_overlap_LOCPOP_allcountries.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

microsats_figures_manuscript

Margaret Corley

2024-09-18

Load libraries

1. Overlap - STUCTURE results for 24 pops overlapping with SNPs (N=637)

1.1 Best K = 3

1.2 K = 10 (for comparison to SNP’s best K)

1.3 Map for K=3

1.1 Pie plots for all pops

1.1.1. Import samples attributes

2. Overlap - STUCTURE results for 24 pops overlapping with SNPs (N=637) for LOCPRIOR run

2.1 Best K = 3

2.1.1 Extract ancestry coefficients for k=3

2.2 Map for K=3

2.2.1 Pie plots