1. Load Packages
2. Sample locations
fastStructure
3. Set 2: r2<0.1 dataset from fastStructure simple
- 3.1 Structure plots for fastStructure data (r2<0.1)
- 3.2 Pie-charts for r2<0.1 fastStructure
  - 3.2.1. Import the Q matrix (K15 for fastStructure)
  - 3.2.2 Q-values
4. Set 1: r2<0.01 dataset from fastStructure simple
- 4.1 Structure plots for fastStructure data (r2<0.01)
- 4.2 Pie-charts for r2<0.01 fastStructure
  - 4.2.1 Import the Q matrix (K15 for fastStructure)
  - 4.2.2 Q-values
5. Set 3: MAF 1% (r2<0.01) dataset from fastStructure simple
Admixture
6. Set 2: r2<0.1 dataset from admixture
- 6.1 Admixture r2<0.1 structure plots
7. Set 1: r2<0.01 dataset from admixture
- 7.1 Admixture r2<0.01 structure plots
8. Set 3: MAF 1% (r2<0.01) dataset from admixture
- 8.1 Admixture MAF 1% r2<0.01 structure plots
- 8.2 Pie-charts for MAF 1% r2<0.01 admixture (Set 3)

1. Load Packages

Load the standard packages

#install.packages(c("raster", "sf", "tidyverse", "fields", "downloader"))

library(raster)  # important to load before tidyverse, otherwise it masks select()
library(tidyverse)
library(scatterpie)
library(sf)
library(ggspatial)
library(ggplot2)
library(dplyr)
library(colorout)
library(here)
library(extrafont)
library(rnaturalearth)
library(rnaturalearthdata)
library(rnaturalearthhires)
library(ggrepel)
library(Cairo)

2. Sample locations

Import samples attributes

#data<-read.csv("sampling_loc_all.csv", stringsAsFactors = TRUE) 
#write_rds(data, "sampling_loc_all.rds")

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     9
## 2 West Europe 2019    10
## 3 West Europe 2017    11
## 4 West Europe 2017    12
## 5 West Europe 2018    13
## 6 West Europe 2017    14

fastStructure

3. Set 2: r2<0.1 dataset from fastStructure simple

3.1 Structure plots for fastStructure data (r2<0.1)

3.1.1 k=15

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run73 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run073/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k15run73)

## # A tibble: 6 × 15
##         X1       X2      X3    X4    X5      X6     X7      X8    X9   X10   X11
##      <dbl>    <dbl>   <dbl> <dbl> <dbl>   <dbl>  <dbl>   <dbl> <dbl> <dbl> <dbl>
## 1 0.000001 0.000001 1.59e-1  1e-6  1e-6 1   e-6 0.841  1   e-6  1e-6  1e-6  1e-6
## 2 0.000001 0.000001 1   e-6  1e-6  1e-6 1   e-6 1.00   1   e-6  1e-6  1e-6  1e-6
## 3 0.000001 0.000001 9.24e-1  1e-6  1e-6 1   e-6 0.0757 1   e-6  1e-6  1e-6  1e-6
## 4 0.000001 0.000001 3.35e-3  1e-6  1e-6 1   e-6 0.996  7.92e-4  1e-6  1e-6  1e-6
## 5 0.000001 0.000001 5.06e-3  1e-6  1e-6 1   e-6 0.995  1   e-6  1e-6  1e-6  1e-6
## 6 0.0783   0.000001 1.20e-1  1e-6  1e-6 9.94e-3 0.791  1   e-6  1e-6  1e-6  1e-6
## # ℹ 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run73 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run73)

head(k15run73)

##    ind pop       X1    X2       X3    X4    X5       X6       X7       X8    X9
## 1 1065 SOC 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001 1e-06
## 3 1067 SOC 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001 1e-06
## 4 1068 SOC 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792 1e-06
## 5 1069 SOC 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001 1e-06
## 6 1070 SOC 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001 1e-06
##     X10   X11   X12   X13   X14   X15
## 1 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06

Rename the columns

# Rename the columns starting from the third one
k15run73 <- k15run73 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run73)

##    ind pop       v1    v2       v3    v4    v5       v6       v7       v8    v9
## 1 1065 SOC 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001 1e-06
## 3 1067 SOC 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001 1e-06
## 4 1068 SOC 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792 1e-06
## 5 1069 SOC 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001 1e-06
## 6 1070 SOC 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001 1e-06
##     v10   v11   v12   v13   v14   v15
## 1 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06

source(
  here(
    "my_theme3.R"
  )
)

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k15run73 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "blue",  
    "#1E90FF",
    "chocolate4",   
    "green",     
    "#75FAFF",    
    "#B20CD9",     
    "purple4",  
    "#F49AC2",    
    "yellow2",
    "#FFFF99",  
    "#B22222",
    "#FFB347",    
    "#77DD77",
    "#FF8C1A",  
    "purple"    
    )

 # Generate all potential variable names
all_variables <- paste0("v", 1:15)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n FastStructure for k1:40 with 47,484 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=15_europe_r2_1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.1.2 k=13

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k13run11 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run011/simple.13.meanQ"
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k13run11)

## # A tibble: 6 × 13
##      X1     X2    X3    X4    X5    X6    X7    X8    X9     X10     X11     X12
##   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.209   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 7.91e-1 1   e-6 1   e-6
## 2 0.741   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 2.59e-1 1   e-6 1   e-6
## 3 1.00    1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6 1   e-6 1   e-6
## 4 0.250   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 7.50e-1 1   e-6 1   e-6
## 5 0.175   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 8.06e-1 1   e-6 1.87e-2
## 6 0.354   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 6.30e-1 1.60e-2 1   e-6
## # ℹ 1 more variable: X13 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k13run11 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k13run11)

head(k13run11)

##    ind pop       X1    X2    X3    X4    X5    X6    X7    X8    X9      X10
## 1 1065 SOC 0.208893 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.791098
## 2 1066 SOC 0.741139 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.258852
## 3 1067 SOC 0.999990 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001
## 4 1068 SOC 0.249965 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.750026
## 5 1069 SOC 0.175186 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.806057
## 6 1070 SOC 0.354239 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.629777
##        X11      X12   X13
## 1 0.000001 0.000001 1e-06
## 2 0.000001 0.000001 1e-06
## 3 0.000001 0.000001 1e-06
## 4 0.000001 0.000001 1e-06
## 5 0.000001 0.018748 1e-06
## 6 0.015976 0.000001 1e-06

Rename the columns

# Rename the columns starting from the third one
k13run11 <- k13run11 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k13run11)

##    ind pop       v1    v2    v3    v4    v5    v6    v7    v8    v9      v10
## 1 1065 SOC 0.208893 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.791098
## 2 1066 SOC 0.741139 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.258852
## 3 1067 SOC 0.999990 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001
## 4 1068 SOC 0.249965 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.750026
## 5 1069 SOC 0.175186 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.806057
## 6 1070 SOC 0.354239 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.629777
##        v11      v12   v13
## 1 0.000001 0.000001 1e-06
## 2 0.000001 0.000001 1e-06
## 3 0.000001 0.000001 1e-06
## 4 0.000001 0.000001 1e-06
## 5 0.000001 0.018748 1e-06
## 6 0.015976 0.000001 1e-06

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k13run11 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "#77DD77",
    "yellow2",
    "#75FAFF",        
    "#B22222",
    "chocolate4",
    "#008080",
    "green",  
    "purple",       
    "purple4",  
    "#FF8C1A",  
    "blue",  
    "#F49AC2",       
    "#1E90FF"
    )

 # Generate all potential variable names
all_variables <- paste0("v", 1:13)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n FastStructure for k1:40 with 47,484 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=13_europe_r2_1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.1.3 k=18

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k18run22 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run022/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k18run22)

## # A tibble: 6 × 18
##       X1    X2      X3    X4    X5    X6      X7    X8    X9     X10   X11   X12
##    <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1   1e-6  1e-6 1.01e-2  1e-6  1e-6  1e-6 5.39e-1  1e-6  1e-6 1   e-6  1e-6  1e-6
## 2   1e-6  1e-6 1   e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## 3   1e-6  1e-6 1   e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6 3.15e-1  1e-6  1e-6
## 4   1e-6  1e-6 1   e-6  1e-6  1e-6  1e-6 4.04e-1  1e-6  1e-6 1   e-6  1e-6  1e-6
## 5   1e-6  1e-6 1   e-6  1e-6  1e-6  1e-6 5.64e-1  1e-6  1e-6 9.88e-2  1e-6  1e-6
## 6   1e-6  1e-6 6.38e-2  1e-6  1e-6  1e-6 5.22e-1  1e-6  1e-6 1.22e-1  1e-6  1e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k18run22 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k18run22)

head(k18run22)

##    ind pop    X1    X2       X3    X4    X5    X6       X7    X8    X9      X10
## 1 1065 SOC 1e-06 1e-06 0.010103 1e-06 1e-06 1e-06 0.538619 1e-06 1e-06 0.000001
## 2 1066 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001
## 3 1067 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.314543
## 4 1068 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.404441 1e-06 1e-06 0.000001
## 5 1069 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.564054 1e-06 1e-06 0.098750
## 6 1070 SOC 1e-06 1e-06 0.063819 1e-06 1e-06 1e-06 0.522484 1e-06 1e-06 0.121619
##     X11   X12   X13   X14      X15   X16      X17   X18
## 1 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.451269 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.999990 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.685447 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 0.025585 1e-06 0.569965 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.337187 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.292070 1e-06

Rename the columns

# Rename the columns starting from the third one
k18run22 <- k18run22 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k18run22)

##    ind pop    v1    v2       v3    v4    v5    v6       v7    v8    v9      v10
## 1 1065 SOC 1e-06 1e-06 0.010103 1e-06 1e-06 1e-06 0.538619 1e-06 1e-06 0.000001
## 2 1066 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001
## 3 1067 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.314543
## 4 1068 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.404441 1e-06 1e-06 0.000001
## 5 1069 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.564054 1e-06 1e-06 0.098750
## 6 1070 SOC 1e-06 1e-06 0.063819 1e-06 1e-06 1e-06 0.522484 1e-06 1e-06 0.121619
##     v11   v12   v13   v14      v15   v16      v17   v18
## 1 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.451269 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.999990 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.685447 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 0.025585 1e-06 0.569965 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.337187 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.292070 1e-06

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k18run22 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "#FF8C1A",       
    "blue",         
    "#008080",     
    "#77DD77",     
    "purple",
    "#F49AC2",  
    "green",    
    "#B22222",      
    "chocolate4",
    "#B20CD9",     
    "green4",
    "purple4",       
    "#FFFF99",      
    "#75FAFF",   
    "orchid",    
    "#1E90FF",
    "#FFB347",
    "yellow"      
    )
  

 # Generate all potential variable names
all_variables <- paste0("v", 1:18)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n FastStructure for k1:40 with 47,484 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=18_europe_r2_1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.1.4 k=5

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k5run4 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run004/simple.5.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k5run4)

## # A tibble: 6 × 5
##         X1       X2    X3       X4       X5
##      <dbl>    <dbl> <dbl>    <dbl>    <dbl>
## 1 0.0165   0.000002 0.984 0.000002 0.000002
## 2 0.000002 0.000002 1.00  0.000002 0.000002
## 3 0.000002 0.000002 1.00  0.000002 0.000002
## 4 0.000002 0.000002 0.954 0.0463   0.000002
## 5 0.000002 0.000002 0.933 0.0674   0.000002
## 6 0.000002 0.000002 0.919 0.0813   0.000002

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k5run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k5run4)

head(k5run4)

##    ind pop       X1    X2       X3       X4    X5
## 1 1065 SOC 0.016451 2e-06 0.983543 0.000002 2e-06
## 2 1066 SOC 0.000002 2e-06 0.999991 0.000002 2e-06
## 3 1067 SOC 0.000002 2e-06 0.999991 0.000002 2e-06
## 4 1068 SOC 0.000002 2e-06 0.953741 0.046252 2e-06
## 5 1069 SOC 0.000002 2e-06 0.932638 0.067356 2e-06
## 6 1070 SOC 0.000002 2e-06 0.918655 0.081339 2e-06

Rename the columns

# Rename the columns starting from the third one
k5run4 <- k5run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k5run4)

##    ind pop       v1    v2       v3       v4    v5
## 1 1065 SOC 0.016451 2e-06 0.983543 0.000002 2e-06
## 2 1066 SOC 0.000002 2e-06 0.999991 0.000002 2e-06
## 3 1067 SOC 0.000002 2e-06 0.999991 0.000002 2e-06
## 4 1068 SOC 0.000002 2e-06 0.953741 0.046252 2e-06
## 5 1069 SOC 0.000002 2e-06 0.932638 0.067356 2e-06
## 6 1070 SOC 0.000002 2e-06 0.918655 0.081339 2e-06

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k5run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
     "#FFFF19",  
     "purple3",        
     "#FF8C1A",     
     "#1E90FF",
     "#77DD37"
     )
  

   # Generate all potential variable names
all_variables <- paste0("v", 1:5)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n FastStructure for k1:40 with 47,484 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=5_europe_r2_1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.1.5 k=6

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k6run4 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run004/simple.6.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k6run4)

## # A tibble: 6 × 6
##         X1       X2    X3       X4       X5       X6
##      <dbl>    <dbl> <dbl>    <dbl>    <dbl>    <dbl>
## 1 0.000002 0.000002 0.978 0.000002 0.0219   0.000002
## 2 0.000002 0.000002 1.00  0.000002 0.000002 0.000002
## 3 0.000002 0.000002 1.00  0.000002 0.000002 0.000002
## 4 0.000002 0.000002 0.934 0.000002 0.0657   0.000002
## 5 0.000002 0.000002 0.919 0.000002 0.0808   0.000002
## 6 0.000002 0.000002 0.902 0.0112   0.0865   0.000002

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k6run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k6run4)

head(k6run4)

##    ind pop    X1    X2       X3       X4       X5    X6
## 1 1065 SOC 2e-06 2e-06 0.978077 0.000002 0.021916 2e-06
## 2 1066 SOC 2e-06 2e-06 0.999991 0.000002 0.000002 2e-06
## 3 1067 SOC 2e-06 2e-06 0.999991 0.000002 0.000002 2e-06
## 4 1068 SOC 2e-06 2e-06 0.934298 0.000002 0.065695 2e-06
## 5 1069 SOC 2e-06 2e-06 0.919204 0.000002 0.080789 2e-06
## 6 1070 SOC 2e-06 2e-06 0.902340 0.011192 0.086463 2e-06

Rename the columns

# Rename the columns starting from the third one
k6run4 <- k6run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k6run4)

##    ind pop    v1    v2       v3       v4       v5    v6
## 1 1065 SOC 2e-06 2e-06 0.978077 0.000002 0.021916 2e-06
## 2 1066 SOC 2e-06 2e-06 0.999991 0.000002 0.000002 2e-06
## 3 1067 SOC 2e-06 2e-06 0.999991 0.000002 0.000002 2e-06
## 4 1068 SOC 2e-06 2e-06 0.934298 0.000002 0.065695 2e-06
## 5 1069 SOC 2e-06 2e-06 0.919204 0.000002 0.080789 2e-06
## 6 1070 SOC 2e-06 2e-06 0.902340 0.011192 0.086463 2e-06

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k6run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
      c(
    "purple3",
    "#1E90FF",
    "#FF8C1A",    
    "#77DD37", 
    "red",
    "#FFFF19"   
    )

   # Generate all potential variable names
all_variables <- paste0("v", 1:6)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n FastStructure for k1:40 with 47,484 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=6_europe_r2_1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.2 Pie-charts for r2<0.1 fastStructure

3.2.1. Import the Q matrix (K15 for fastStructure)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run73 <- read_delim(
  here(
    "output",
    "europe",
    "admixture",
    "r_1_run073_simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k15run73 <- k15run73 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k15run73)

## # A tibble: 6 × 15
##         X1       X2      X3    X4    X5      X6     X7      X8    X9   X10   X11
##      <dbl>    <dbl>   <dbl> <dbl> <dbl>   <dbl>  <dbl>   <dbl> <dbl> <dbl> <dbl>
## 1 0.000001 0.000001 1.59e-1  1e-6  1e-6 1   e-6 0.841  1   e-6  1e-6  1e-6  1e-6
## 2 0.000001 0.000001 1   e-6  1e-6  1e-6 1   e-6 1.00   1   e-6  1e-6  1e-6  1e-6
## 3 0.000001 0.000001 9.24e-1  1e-6  1e-6 1   e-6 0.0757 1   e-6  1e-6  1e-6  1e-6
## 4 0.000001 0.000001 3.35e-3  1e-6  1e-6 1   e-6 0.996  7.92e-4  1e-6  1e-6  1e-6
## 5 0.000001 0.000001 5.06e-3  1e-6  1e-6 1   e-6 0.995  1   e-6  1e-6  1e-6  1e-6
## 6 0.0783   0.000001 1.20e-1  1e-6  1e-6 9.94e-3 0.791  1   e-6  1e-6  1e-6  1e-6
## # ℹ 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run73 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run73)

head(k15run73)

##    ind pop       X1    X2       X3    X4    X5       X6       X7       X8    X9
## 1 1065 SOC 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001 1e-06
## 3 1067 SOC 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001 1e-06
## 4 1068 SOC 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792 1e-06
## 5 1069 SOC 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001 1e-06
## 6 1070 SOC 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001 1e-06
##     X10   X11   X12   X13   X14   X15
## 1 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06

Rename the columns

# Rename the columns starting from the third one
k15run73 <- k15run73 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run73)

##    ind pop       v1    v2       v3    v4    v5       v6       v7       v8    v9
## 1 1065 SOC 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001 1e-06
## 3 1067 SOC 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001 1e-06
## 4 1068 SOC 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792 1e-06
## 5 1069 SOC 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001 1e-06
## 6 1070 SOC 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001 1e-06
##     v10   v11   v12   v13   v14   v15
## 1 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06

Merge with pops

# Add an index column to Q_tibble
k15run73$index <- seq_len(nrow(k15run73))

# Perform the merge as before
df1 <-
  merge(
    k15run73,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1    v2       v3    v4    v5       v6       v7       v8
## 310 SOC 1065 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001
## 311 SOC 1066 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001
## 312 SOC 1067 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001
## 313 SOC 1068 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792
## 314 SOC 1069 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001
## 315 SOC 1070 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001
##        v9   v10   v11   v12   v13   v14   v15 Latitude Longitude Pop_City
## 310 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042  39.74533    Sochi
## 311 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042  39.74533    Sochi
## 312 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042  39.74533    Sochi
## 313 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042  39.74533    Sochi
## 314 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042  39.74533    Sochi
## 315 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042  39.74533    Sochi
##     Country         Region   Subregion Year order
## 310  Russia Eastern Europe East Europe 2021    38
## 311  Russia Eastern Europe East Europe 2021    38
## 312  Russia Eastern Europe East Europe 2021    38
## 313  Russia Eastern Europe East Europe 2021    38
## 314  Russia Eastern Europe East Europe 2021    38
## 315  Russia Eastern Europe East Europe 2021    38

3.2.2 Q-values

make a palette with 15 colors

colors2 <-c(
      "v1" = "#FF8C1A",
      "v2" = "#FFFF99",
      "v3" = "purple4",
      "v4" = "#F49AC2",
      "v5" = "#77DD77",
      "v6" = "yellow2",
      "v7" = "purple",
      "v8" = "#B20CC9",
      "v9" = "chocolate4",
      "v10" = "blue",
      "v11" = "green",
      "v12" = "#FFB347",  
      "v13" = "#1E90FF",
      "v14" = "#75FAFF",
      "v15" = "#B22222"
      )
colors2

##           v1           v2           v3           v4           v5           v6 
##    "#FF8C1A"    "#FFFF99"    "purple4"    "#F49AC2"    "#77DD77"    "yellow2" 
##           v7           v8           v9          v10          v11          v12 
##     "purple"    "#B20CC9" "chocolate4"       "blue"      "green"    "#FFB347" 
##          v13          v14          v15 
##    "#1E90FF"    "#75FAFF"    "#B22222"

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world 
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
  my_theme()

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_r1_k15_pie_all_countries.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

4. Set 1: r2<0.01 dataset from fastStructure simple

4.1 Structure plots for fastStructure data (r2<0.01)

4.1.1 k=15

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run09 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run009/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k15run09)

## # A tibble: 6 × 15
##         X1       X2     X3    X4      X5      X6    X7    X8      X9   X10   X11
##      <dbl>    <dbl>  <dbl> <dbl>   <dbl>   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1 0.000002 0.351      2e-6  2e-6 9.94e-2 5.25e-1  2e-6  2e-6 2   e-6  2e-6  2e-6
## 2 0.000002 1.00       2e-6  2e-6 2   e-6 2   e-6  2e-6  2e-6 2   e-6  2e-6  2e-6
## 3 0.000002 0.000002   2e-6  2e-6 1.00e+0 2   e-6  2e-6  2e-6 2   e-6  2e-6  2e-6
## 4 0.000002 0.637      2e-6  2e-6 2   e-6 3.24e-1  2e-6  2e-6 2   e-6  2e-6  2e-6
## 5 0.000002 0.388      2e-6  2e-6 2   e-6 5.67e-1  2e-6  2e-6 2   e-6  2e-6  2e-6
## 6 0.000002 0.588      2e-6  2e-6 2.02e-1 2   e-6  2e-6  2e-6 1.59e-1  2e-6  2e-6
## # ℹ 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run09 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run09)

head(k15run09)

##    ind pop    X1       X2    X3    X4       X5       X6    X7    X8       X9
## 1 1065 SOC 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 2 1066 SOC 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 5 1069 SOC 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 6 1070 SOC 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
##     X10   X11      X12   X13      X14      X15
## 1 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002
## 2 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 3 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 4 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298
## 5 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002
## 6 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002

Rename the columns

# Rename the columns starting from the third one
k15run09 <- k15run09 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run09)

##    ind pop    v1       v2    v3    v4       v5       v6    v7    v8       v9
## 1 1065 SOC 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 2 1066 SOC 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 5 1069 SOC 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 6 1070 SOC 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
##     v10   v11      v12   v13      v14      v15
## 1 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002
## 2 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 3 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 4 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298
## 5 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002
## 6 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002

source(
  here(
    "my_theme3.R"
  )
)

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k15run09 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "purple4",       
    "#B20CD9",       
    "#B22222",          
    "purple",    
    "#75FAFF",    
    "#FFB347", 
    "#008080",      
    "yellow2",
    "blue", 
    "#F49AC2",    
    "#77DD77",
    "green",     
    "orchid",
    "#1E90FF",
    "#FF8C1A"    
    )
 
 # Generate all potential variable names
all_variables <- paste0("v", 1:15)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n FastStructure for k1:40 with 17,028 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=15_europe_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

4.1.2 k=13

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k13run41 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run041/simple.13.meanQ"
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k13run41)

## # A tibble: 6 × 13
##         X1       X2       X3      X4    X5    X6    X7    X8      X9   X10   X11
##      <dbl>    <dbl>    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1 0.000002 0.0629   0.000002    2e-6  2e-6  2e-6  2e-6  2e-6 7.83e-1  2e-6  2e-6
## 2 0.000002 0.000002 0.000002    2e-6  2e-6  2e-6  2e-6  2e-6 8.81e-1  2e-6  2e-6
## 3 0.000002 0.000002 0.000002    2e-6  2e-6  2e-6  2e-6  2e-6 2   e-6  2e-6  2e-6
## 4 0.0409   0.000002 0.000002    2e-6  2e-6  2e-6  2e-6  2e-6 9.59e-1  2e-6  2e-6
## 5 0.000002 0.0855   0.000002    2e-6  2e-6  2e-6  2e-6  2e-6 9.14e-1  2e-6  2e-6
## 6 0.000002 0.122    0.000002    2e-6  2e-6  2e-6  2e-6  2e-6 7.37e-1  2e-6  2e-6
## # ℹ 2 more variables: X12 <dbl>, X13 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k13run41 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k13run41)

head(k13run41)

##    ind pop       X1       X2    X3    X4    X5    X6    X7    X8       X9   X10
## 1 1065 SOC 0.000002 0.062911 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.782510 2e-06
## 2 1066 SOC 0.000002 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.881126 2e-06
## 3 1067 SOC 0.000002 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06
## 4 1068 SOC 0.040936 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.959039 2e-06
## 5 1069 SOC 0.000002 0.085520 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.914454 2e-06
## 6 1070 SOC 0.000002 0.121511 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.736631 2e-06
##     X11      X12   X13
## 1 2e-06 0.154556 2e-06
## 2 2e-06 0.118848 2e-06
## 3 2e-06 0.999972 2e-06
## 4 2e-06 0.000002 2e-06
## 5 2e-06 0.000002 2e-06
## 6 2e-06 0.141834 2e-06

Rename the columns

# Rename the columns starting from the third one
k13run41 <- k13run41 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k13run41)

##    ind pop       v1       v2    v3    v4    v5    v6    v7    v8       v9   v10
## 1 1065 SOC 0.000002 0.062911 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.782510 2e-06
## 2 1066 SOC 0.000002 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.881126 2e-06
## 3 1067 SOC 0.000002 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06
## 4 1068 SOC 0.040936 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.959039 2e-06
## 5 1069 SOC 0.000002 0.085520 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.914454 2e-06
## 6 1070 SOC 0.000002 0.121511 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.736631 2e-06
##     v11      v12   v13
## 1 2e-06 0.154556 2e-06
## 2 2e-06 0.118848 2e-06
## 3 2e-06 0.999972 2e-06
## 4 2e-06 0.000002 2e-06
## 5 2e-06 0.000002 2e-06
## 6 2e-06 0.141834 2e-06

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k13run41 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "#FF8C1A",        
    "blue",  
    "#77DD77",    
    "#B20CD9",        
    "#B22222",    
    "purple4",  
    "green",  
    "#008080",    
    "chocolate4",
    "#F49AC2",   
    "#1E90FF",
    "purple",    
    "yellow2"
        )

 # Generate all potential variable names
all_variables <- paste0("v", 1:13)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n FastStructure for k1:40 with 17,028 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=13_europe_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

4.1.3 k=18

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k18run35 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run035/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k18run35)

## # A tibble: 6 × 18
##       X1    X2      X3    X4      X5    X6    X7    X8    X9   X10     X11   X12
##    <dbl> <dbl>   <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1   2e-6  2e-6 2   e-6  2e-6 5.28e-1  2e-6  2e-6  2e-6  2e-6  2e-6 2   e-6  2e-6
## 2   2e-6  2e-6 2   e-6  2e-6 9.54e-1  2e-6  2e-6  2e-6  2e-6  2e-6 2   e-6  2e-6
## 3   2e-6  2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6  2e-6  2e-6  2e-6 2   e-6  2e-6
## 4   2e-6  2e-6 2   e-6  2e-6 6.98e-1  2e-6  2e-6  2e-6  2e-6  2e-6 3.75e-2  2e-6
## 5   2e-6  2e-6 6.37e-2  2e-6 4.69e-1  2e-6  2e-6  2e-6  2e-6  2e-6 2   e-6  2e-6
## 6   2e-6  2e-6 2   e-6  2e-6 7.11e-1  2e-6  2e-6  2e-6  2e-6  2e-6 2   e-6  2e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k18run35 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k18run35)

head(k18run35)

##    ind pop    X1    X2       X3    X4       X5    X6    X7    X8    X9   X10
## 1 1065 SOC 2e-06 2e-06 0.000002 2e-06 0.527816 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 2e-06 0.000002 2e-06 0.953589 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 2e-06 0.000002 2e-06 0.698323 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 2e-06 0.063682 2e-06 0.468962 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 2e-06 0.000002 2e-06 0.710877 2e-06 2e-06 2e-06 2e-06 2e-06
##        X11   X12   X13   X14   X15      X16   X17      X18
## 1 0.000002 2e-06 2e-06 2e-06 2e-06 0.110473 2e-06 0.361686
## 2 0.000002 2e-06 2e-06 2e-06 2e-06 0.046384 2e-06 0.000002
## 3 0.000002 2e-06 2e-06 2e-06 2e-06 0.999972 2e-06 0.000002
## 4 0.037455 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06 0.264197
## 5 0.000002 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06 0.467332
## 6 0.000002 2e-06 2e-06 2e-06 2e-06 0.289096 2e-06 0.000002

Rename the columns

# Rename the columns starting from the third one
k18run35 <- k18run35 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k18run35)

##    ind pop    v1    v2       v3    v4       v5    v6    v7    v8    v9   v10
## 1 1065 SOC 2e-06 2e-06 0.000002 2e-06 0.527816 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 2e-06 0.000002 2e-06 0.953589 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 2e-06 0.000002 2e-06 0.698323 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 2e-06 0.063682 2e-06 0.468962 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 2e-06 0.000002 2e-06 0.710877 2e-06 2e-06 2e-06 2e-06 2e-06
##        v11   v12   v13   v14   v15      v16   v17      v18
## 1 0.000002 2e-06 2e-06 2e-06 2e-06 0.110473 2e-06 0.361686
## 2 0.000002 2e-06 2e-06 2e-06 2e-06 0.046384 2e-06 0.000002
## 3 0.000002 2e-06 2e-06 2e-06 2e-06 0.999972 2e-06 0.000002
## 4 0.037455 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06 0.264197
## 5 0.000002 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06 0.467332
## 6 0.000002 2e-06 2e-06 2e-06 2e-06 0.289096 2e-06 0.000002

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k18run35 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "yellow2",     
    "#B20CD9",        
    "#B22222",       
    "#FFB347",
    "#F49AC2",      
    "#FFFF99",   
    "#75FAFF",  
    "blue", 
    "green4",    
    "green",   
    "#FF8C1A",  
    "#77DD77",   
    "#008080", 
    "orchid",    
    "goldenrod",
    "purple",
    "#1E90FF",
    "purple4"    
    )
  
   # Generate all potential variable names
all_variables <- paste0("v", 1:18)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n FastStructure for k1:40 with 17,028 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=18_europe_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

4.1.4 k=5

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k5run4 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run004/simple.5.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k5run4)

## # A tibble: 6 × 5
##         X1       X2       X3       X4    X5
##      <dbl>    <dbl>    <dbl>    <dbl> <dbl>
## 1 0.000006 0.000006 0.0728   0.000006 0.927
## 2 0.000006 0.000006 0.000006 0.000006 1.00 
## 3 0.000006 0.000006 0.000006 0.000006 1.00 
## 4 0.120    0.000006 0.0220   0.000006 0.858
## 5 0.114    0.000006 0.0497   0.000006 0.837
## 6 0.0746   0.000006 0.0772   0.000006 0.848

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k5run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k5run4)

head(k5run4)

##    ind pop       X1    X2       X3    X4       X5
## 1 1065 SOC 0.000006 6e-06 0.072818 6e-06 0.927163
## 2 1066 SOC 0.000006 6e-06 0.000006 6e-06 0.999975
## 3 1067 SOC 0.000006 6e-06 0.000006 6e-06 0.999975
## 4 1068 SOC 0.119502 6e-06 0.022025 6e-06 0.858461
## 5 1069 SOC 0.113794 6e-06 0.049666 6e-06 0.836528
## 6 1070 SOC 0.074577 6e-06 0.077227 6e-06 0.848184

Rename the columns

# Rename the columns starting from the third one
k5run4 <- k5run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k5run4)

##    ind pop       v1    v2       v3    v4       v5
## 1 1065 SOC 0.000006 6e-06 0.072818 6e-06 0.927163
## 2 1066 SOC 0.000006 6e-06 0.000006 6e-06 0.999975
## 3 1067 SOC 0.000006 6e-06 0.000006 6e-06 0.999975
## 4 1068 SOC 0.119502 6e-06 0.022025 6e-06 0.858461
## 5 1069 SOC 0.113794 6e-06 0.049666 6e-06 0.836528
## 6 1070 SOC 0.074577 6e-06 0.077227 6e-06 0.848184

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k5run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
     "#1E90FF",
     "#FFFF19",  
     "purple3",    
     "#77DD37",     
     "#FF8C1A"    
     )
  
   # Generate all potential variable names
all_variables <- paste0("v", 1:5)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n FastStructure for k1:40 with 17,028 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=5_europe_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

4.1.5 k=6

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k6run4 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run004/simple.6.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k6run4)

## # A tibble: 6 × 6
##      X1       X2       X3       X4       X5       X6
##   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
## 1 0.947 0.0533   0.000005 0.000005 0.000005 0.000005
## 2 1.00  0.000005 0.000005 0.000005 0.000005 0.000005
## 3 1.00  0.000005 0.000005 0.000005 0.000005 0.000005
## 4 0.903 0.0567   0.0407   0.000005 0.000005 0.000005
## 5 0.851 0.00334  0.0308   0.115    0.000005 0.000005
## 6 0.887 0.00293  0.110    0.000005 0.000005 0.000005

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k6run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k6run4)

head(k6run4)

##    ind pop       X1       X2       X3       X4    X5    X6
## 1 1065 SOC 0.946689 0.053291 0.000005 0.000005 5e-06 5e-06
## 2 1066 SOC 0.999975 0.000005 0.000005 0.000005 5e-06 5e-06
## 3 1067 SOC 0.999975 0.000005 0.000005 0.000005 5e-06 5e-06
## 4 1068 SOC 0.902567 0.056717 0.040701 0.000005 5e-06 5e-06
## 5 1069 SOC 0.850611 0.003335 0.030846 0.115197 5e-06 5e-06
## 6 1070 SOC 0.886852 0.002929 0.110204 0.000005 5e-06 5e-06

Rename the columns

# Rename the columns starting from the third one
k6run4 <- k6run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k6run4)

##    ind pop       v1       v2       v3       v4    v5    v6
## 1 1065 SOC 0.946689 0.053291 0.000005 0.000005 5e-06 5e-06
## 2 1066 SOC 0.999975 0.000005 0.000005 0.000005 5e-06 5e-06
## 3 1067 SOC 0.999975 0.000005 0.000005 0.000005 5e-06 5e-06
## 4 1068 SOC 0.902567 0.056717 0.040701 0.000005 5e-06 5e-06
## 5 1069 SOC 0.850611 0.003335 0.030846 0.115197 5e-06 5e-06
## 6 1070 SOC 0.886852 0.002929 0.110204 0.000005 5e-06 5e-06

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k6run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
      c(
    "#77DD37", 
    "#FF8C1A",    
    "#1E90FF",    
    "#FFFF19",
    "purple3",
    "red"
    )
  
   # Generate all potential variable names
all_variables <- paste0("v", 1:6)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n FastStructure for k1:40 with 17,028 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_plot_k=6_europe_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

4.2 Pie-charts for r2<0.01 fastStructure

4.2.1 Import the Q matrix (K15 for fastStructure)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run09 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run009/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k15run09 <- k15run09 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k15run09)

## # A tibble: 6 × 15
##         X1       X2     X3    X4      X5      X6    X7    X8      X9   X10   X11
##      <dbl>    <dbl>  <dbl> <dbl>   <dbl>   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1 0.000002 0.351      2e-6  2e-6 9.94e-2 5.25e-1  2e-6  2e-6 2   e-6  2e-6  2e-6
## 2 0.000002 1.00       2e-6  2e-6 2   e-6 2   e-6  2e-6  2e-6 2   e-6  2e-6  2e-6
## 3 0.000002 0.000002   2e-6  2e-6 1.00e+0 2   e-6  2e-6  2e-6 2   e-6  2e-6  2e-6
## 4 0.000002 0.637      2e-6  2e-6 2   e-6 3.24e-1  2e-6  2e-6 2   e-6  2e-6  2e-6
## 5 0.000002 0.388      2e-6  2e-6 2   e-6 5.67e-1  2e-6  2e-6 2   e-6  2e-6  2e-6
## 6 0.000002 0.588      2e-6  2e-6 2.02e-1 2   e-6  2e-6  2e-6 1.59e-1  2e-6  2e-6
## # ℹ 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run09 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run09)

head(k15run09)

##    ind pop    X1       X2    X3    X4       X5       X6    X7    X8       X9
## 1 1065 SOC 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 2 1066 SOC 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 5 1069 SOC 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 6 1070 SOC 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
##     X10   X11      X12   X13      X14      X15
## 1 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002
## 2 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 3 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 4 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298
## 5 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002
## 6 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002

Rename the columns

# Rename the columns starting from the third one
k15run09 <- k15run09 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run09)

##    ind pop    v1       v2    v3    v4       v5       v6    v7    v8       v9
## 1 1065 SOC 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 2 1066 SOC 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 5 1069 SOC 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 6 1070 SOC 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
##     v10   v11      v12   v13      v14      v15
## 1 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002
## 2 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 3 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 4 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298
## 5 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002
## 6 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002

Merge with pops

# Add an index column to Q_tibble
k15run09$index <- seq_len(nrow(k15run09))

# Perform the merge as before
df1 <-
  merge(
    k15run09,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind    v1       v2    v3    v4       v5       v6    v7    v8       v9
## 310 SOC 1065 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 311 SOC 1066 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 312 SOC 1067 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 313 SOC 1068 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 314 SOC 1069 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 315 SOC 1070 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
##       v10   v11      v12   v13      v14      v15 Latitude Longitude Pop_City
## 310 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002 43.60042  39.74533    Sochi
## 311 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002 43.60042  39.74533    Sochi
## 312 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002 43.60042  39.74533    Sochi
## 313 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298 43.60042  39.74533    Sochi
## 314 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002 43.60042  39.74533    Sochi
## 315 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002 43.60042  39.74533    Sochi
##     Country         Region   Subregion Year order
## 310  Russia Eastern Europe East Europe 2021    38
## 311  Russia Eastern Europe East Europe 2021    38
## 312  Russia Eastern Europe East Europe 2021    38
## 313  Russia Eastern Europe East Europe 2021    38
## 314  Russia Eastern Europe East Europe 2021    38
## 315  Russia Eastern Europe East Europe 2021    38

4.2.2 Q-values

make a palette with 15 colors

colors2 <-c(
      "v1" = "#F49AC2",
      "v2" = "#B20CC9",
      "v3" = "#77DD77",
      "v4" = "green",
      "v5" = "purple4",
      "v6" = "purple",
      "v7" = "#1E90FF",
      "v8" = "#FFB347",  
      "v9" = "#FF8C1A",
      "v10" = "blue",
      "v11" = "#75FAFF", 
      "v12" = "yellow2", 
      "v13" = "#B22222",
      "v14" = "#008080",
      "v15" = "orchid"
      )
colors2

##        v1        v2        v3        v4        v5        v6        v7        v8 
## "#F49AC2" "#B20CC9" "#77DD77"   "green" "purple4"  "purple" "#1E90FF" "#FFB347" 
##        v9       v10       v11       v12       v13       v14       v15 
## "#FF8C1A"    "blue" "#75FAFF" "yellow2" "#B22222" "#008080"  "orchid"

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "fastStructure_r01_k15_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5. Set 3: MAF 1% (r2<0.01) dataset from fastStructure simple

5.1 Structure plots for fastStructure data (r2<0.01)

5.1.1 k=15

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run07 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run007/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k15run07)

## # A tibble: 6 × 15
##         X1    X2       X3     X4    X5    X6    X7    X8    X9   X10   X11   X12
##      <dbl> <dbl>    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 1.00  0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 2 0.000002 1.00  0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 3 0.000002 1.00  0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 4 0.000002 0.927 0.0732     2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 5 0.000002 0.996 0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 6 0.000002 0.999 0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## # ℹ 3 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run07 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run07)

head(k15run07)

##    ind pop    X1       X2       X3    X4    X5    X6    X7    X8    X9   X10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
##     X11   X12   X13   X14      X15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742

Rename the columns

# Rename the columns starting from the third one
k15run07 <- k15run07 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run07)

##    ind pop    v1       v2       v3    v4    v5    v6    v7    v8    v9   v10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
##     v11   v12   v13   v14      v15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742

source(
  here(
    "my_theme3.R"
  )
)

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k15run07 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "green",  
    "green4",
    "#1E90FF",
    "#75FAFF", 
    "purple",       
    "#FF8C1A",  
    "#B22222",      
    "#FFB347", 
    "blue",     
    "#008080",   
    "chocolate4",    
    "purple4",          
    "#77DD77",
    "#F49AC2",    
    "yellow2"    
    )
  
  
 # Generate all potential variable names
all_variables <- paste0("v", 1:15)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n FastStructure for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_k=15_europe_MAF1_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.1.2 k=13

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k13run8 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run008/simple.13.meanQ"
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k13run8)

## # A tibble: 6 × 13
##         X1    X2      X3      X4    X5      X6    X7    X8      X9     X10   X11
##      <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>
## 1 0.546    0.443    2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 1.17e-2 2   e-6  2e-6
## 2 0.000002 1.00     2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 2   e-6 2   e-6  2e-6
## 3 0.000002 1.00     2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 2   e-6 2   e-6  2e-6
## 4 0.000002 0.843    2e-6 2   e-6  2e-6 1.57e-1  2e-6  2e-6 2   e-6 2   e-6  2e-6
## 5 0.483    0.475    2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 4.20e-2 2   e-6  2e-6
## 6 0.313    0.577    2e-6 2.64e-2  2e-6 2   e-6  2e-6  2e-6 2   e-6 8.40e-2  2e-6
## # ℹ 2 more variables: X12 <dbl>, X13 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"


# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k13run8 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k13run8)

head(k13run8)

##    ind pop       X1       X2    X3       X4    X5       X6    X7    X8       X9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
##        X10   X11   X12   X13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06

Rename the columns

# Rename the columns starting from the third one
k13run8 <- k13run8 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k13run8)

##    ind pop       v1       v2    v3       v4    v5       v6    v7    v8       v9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
##        v10   v11   v12   v13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k13run8 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "#1E90FF",
    "#008080", 
    "blue",     
    "#FF8C1A",     
    "chocolate4",        
    "yellow2",
    "purple4",     
    "purple",    
    "green",  
    "#B22222",      
    "#008080",    
    "#B20CD9",      
    "#F49AC2"
    )
  

 # Generate all potential variable names
all_variables <- paste0("v", 1:13)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n FastStructure for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_k=13_europe_MAF1_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.1.3 k=18

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k18run2 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run002/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k18run2)

## # A tibble: 6 × 18
##       X1    X2    X3    X4      X5    X6    X7      X8    X9   X10     X11   X12
##    <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1   1e-6  1e-6  1e-6  1e-6 2.77e-1  1e-6  1e-6 6.23e-1  1e-6  1e-6 1   e-6  1e-6
## 2   1e-6  1e-6  1e-6  1e-6 1.00e+0  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6
## 3   1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6
## 4   1e-6  1e-6  1e-6  1e-6 5.60e-1  1e-6  1e-6 4.33e-1  1e-6  1e-6 6.86e-3  1e-6
## 5   1e-6  1e-6  1e-6  1e-6 2.69e-1  1e-6  1e-6 7.31e-1  1e-6  1e-6 1   e-6  1e-6
## 6   1e-6  1e-6  1e-6  1e-6 4.14e-1  1e-6  1e-6 3.85e-1  1e-6  1e-6 1   e-6  1e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k18run2 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k18run2)

head(k18run2)

##    ind pop    X1    X2    X3    X4       X5    X6    X7       X8    X9   X10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
##        X11   X12   X13   X14   X15      X16      X17   X18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06

Rename the columns

# Rename the columns starting from the third one
k18run2 <- k18run2 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k18run2)

##    ind pop    v1    v2    v3    v4       v5    v6    v7       v8    v9   v10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
##        v11   v12   v13   v14   v15      v16      v17   v18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k18run2 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "green",         
    "#77DD77",  
    "blue", 
    "yellow2",     
    "#1E90FF",
    "green4",       
    "purple4",
    "#F49AC2",      
    "#FFB347",
    "orchid",  
    "purple",
    "chocolate4",
    "#B22222",  
    "#75FAFF",      
    "#FF8C1A",  
    "#B20CD9",   
    "#FFFF99",       
    "#008080"    
    )
  
  
   # Generate all potential variable names
all_variables <- paste0("v", 1:18)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n FastStructure for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_k=18_europe_MAF1_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.1.4 k=5

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k5run4 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.5.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k5run4)

## # A tibble: 6 × 5
##         X1       X2       X3    X4       X5
##      <dbl>    <dbl>    <dbl> <dbl>    <dbl>
## 1 0.000005 0.000005 0.0292   0.971 0.000005
## 2 0.000005 0.000005 0.000005 1.00  0.000005
## 3 0.000005 0.000005 0.000005 1.00  0.000005
## 4 0.000005 0.104    0.00251  0.894 0.000005
## 5 0.000005 0.101    0.000005 0.871 0.0275  
## 6 0.000005 0.0316   0.000005 0.889 0.0796

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k5run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k5run4)

head(k5run4)

##    ind pop    X1       X2       X3       X4       X5
## 1 1065 SOC 5e-06 0.000005 0.029175 0.970810 0.000005
## 2 1066 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 3 1067 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 4 1068 SOC 5e-06 0.103530 0.002506 0.893954 0.000005
## 5 1069 SOC 5e-06 0.101074 0.000005 0.871372 0.027544
## 6 1070 SOC 5e-06 0.031613 0.000005 0.888727 0.079650

Rename the columns

# Rename the columns starting from the third one
k5run4 <- k5run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k5run4)

##    ind pop    v1       v2       v3       v4       v5
## 1 1065 SOC 5e-06 0.000005 0.029175 0.970810 0.000005
## 2 1066 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 3 1067 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 4 1068 SOC 5e-06 0.103530 0.002506 0.893954 0.000005
## 5 1069 SOC 5e-06 0.101074 0.000005 0.871372 0.027544
## 6 1070 SOC 5e-06 0.031613 0.000005 0.888727 0.079650

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k5run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
  c(     
     "#FFFF19",  
     "purple3",    
     "#1E90FF",
     "#FF8C1A",
     "#77DD37"
     )
  

   # Generate all potential variable names
all_variables <- paste0("v", 1:5)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n FastStructure for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_k=5_europe_MAF1_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.1.5 k=6

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k6run4 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.6.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k6run4)

## # A tibble: 6 × 6
##         X1       X2    X3       X4       X5       X6
##      <dbl>    <dbl> <dbl>    <dbl>    <dbl>    <dbl>
## 1 0.000004 0.000004 0.950 0.0503   0.000004 0.000004
## 2 0.000004 0.000004 1.00  0.000004 0.000004 0.000004
## 3 0.000004 0.000004 1.00  0.000004 0.000004 0.000004
## 4 0.0835   0.000004 0.897 0.0196   0.000075 0.000004
## 5 0.000004 0.000004 0.927 0.0728   0.000004 0.000004
## 6 0.0653   0.000004 0.878 0.0569   0.000004 0.000004

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k6run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k6run4)

head(k6run4)

##    ind pop       X1    X2       X3       X4      X5    X6
## 1 1065 SOC 0.000004 4e-06 0.949654 0.050330 4.0e-06 4e-06
## 2 1066 SOC 0.000004 4e-06 0.999979 0.000004 4.0e-06 4e-06
## 3 1067 SOC 0.000004 4e-06 0.999979 0.000004 4.0e-06 4e-06
## 4 1068 SOC 0.083468 4e-06 0.896836 0.019613 7.5e-05 4e-06
## 5 1069 SOC 0.000004 4e-06 0.927134 0.072850 4.0e-06 4e-06
## 6 1070 SOC 0.065259 4e-06 0.877866 0.056863 4.0e-06 4e-06

Rename the columns

# Rename the columns starting from the third one
k6run4 <- k6run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k6run4)

##    ind pop       v1    v2       v3       v4      v5    v6
## 1 1065 SOC 0.000004 4e-06 0.949654 0.050330 4.0e-06 4e-06
## 2 1066 SOC 0.000004 4e-06 0.999979 0.000004 4.0e-06 4e-06
## 3 1067 SOC 0.000004 4e-06 0.999979 0.000004 4.0e-06 4e-06
## 4 1068 SOC 0.083468 4e-06 0.896836 0.019613 7.5e-05 4e-06
## 5 1069 SOC 0.000004 4e-06 0.927134 0.072850 4.0e-06 4e-06
## 6 1070 SOC 0.065259 4e-06 0.877866 0.056863 4.0e-06 4e-06

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k6run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
c(
    "purple3",
    "#FF8C1A",    
    "#FFFF19",    
    "#1E90FF",      
    "red",    
    "#77DD37"
    )

  
   # Generate all potential variable names
all_variables <- paste0("v", 1:6)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n FastStructure for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_k=6_europe_MAF1_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.1.6 k=20

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k20run13 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run013/simple.20.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

head(k20run13)

## # A tibble: 6 × 20
##       X1      X2    X3    X4    X5    X6    X7    X8    X9   X10     X11     X12
##    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>
## 1   1e-6 1   e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6 1.88e-1
## 2   1e-6 1   e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6 2.36e-4
## 3   1e-6 1   e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6 1.00e+0
## 4   1e-6 1.10e-3  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 2.98e-2 1   e-6
## 5   1e-6 1   e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6 1   e-6
## 6   1e-6 1   e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6 1.49e-1
## # ℹ 8 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>, X19 <dbl>, X20 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k20run13 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k20run13)

head(k20run13)

##    ind pop    X1       X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1 1065 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1066 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1067 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1068 SOC 1e-06 0.001097 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1069 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1070 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
##        X11      X12   X13      X14   X15   X16      X17   X18   X19   X20
## 1 0.000001 0.188189 1e-06 0.811789 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 2 0.000001 0.000236 1e-06 0.999742 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 3 0.000001 0.999977 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 4 0.029811 0.000001 1e-06 0.950145 1e-06 1e-06 0.018928 1e-06 1e-06 1e-06
## 5 0.000001 0.000001 1e-06 0.951559 1e-06 1e-06 0.048419 1e-06 1e-06 1e-06
## 6 0.000001 0.149064 1e-06 0.479014 1e-06 1e-06 0.371902 1e-06 1e-06 1e-06

Rename the columns

# Rename the columns starting from the third one
k20run13 <- k20run13 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k20run13)

##    ind pop    v1       v2    v3    v4    v5    v6    v7    v8    v9   v10
## 1 1065 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1066 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1067 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1068 SOC 1e-06 0.001097 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1069 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1070 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
##        v11      v12   v13      v14   v15   v16      v17   v18   v19   v20
## 1 0.000001 0.188189 1e-06 0.811789 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 2 0.000001 0.000236 1e-06 0.999742 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 3 0.000001 0.999977 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 4 0.029811 0.000001 1e-06 0.950145 1e-06 1e-06 0.018928 1e-06 1e-06 1e-06
## 5 0.000001 0.000001 1e-06 0.951559 1e-06 1e-06 0.048419 1e-06 1e-06 1e-06
## 6 0.000001 0.149064 1e-06 0.479014 1e-06 1e-06 0.371902 1e-06 1e-06 1e-06

source(
  here(
    "my_theme3.R"
  )
)

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

Melt data frame for plotting

# Melt the data frame for plotting
Q_melted <- k20run13 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
c(
    "purple",
    "yellow2",      
    "#B20CD9",     
    "#FFB347",
    "#F49AC2",
    "#B22222",    
    "orchid",    
    "goldenrod3",    
    "gray",
    "#1E90FF",      
    "green4", 
    "#FFFF99",
    "green",
    "#008080",
    "#75FAFF",
    "chocolate4",
    "#77DD77",
    "purple4",
    "#FF8C1A",
    "blue")
  
 
 # Generate all potential variable names
all_variables <- paste0("v", 1:20)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=20.\n FastStructure for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_k=20_europe_MAF1_r2_01.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.2 Pie-charts for MAF 1% r2<0.01 fastStructure

5.2.1 k=15 map

Import the Q matrix (K15 for fastStructure) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run07 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run007/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k15run07 <- k15run07 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k15run07)

## # A tibble: 6 × 15
##         X1    X2       X3     X4    X5    X6    X7    X8    X9   X10   X11   X12
##      <dbl> <dbl>    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 1.00  0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 2 0.000002 1.00  0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 3 0.000002 1.00  0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 4 0.000002 0.927 0.0732     2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 5 0.000002 0.996 0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 6 0.000002 0.999 0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## # ℹ 3 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run07 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run07)

head(k15run07)

##    ind pop    X1       X2       X3    X4    X5    X6    X7    X8    X9   X10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
##     X11   X12   X13   X14      X15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742

Rename the columns

# Rename the columns starting from the third one
k15run07 <- k15run07 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run07)

##    ind pop    v1       v2       v3    v4    v5    v6    v7    v8    v9   v10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
##     v11   v12   v13   v14      v15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742

Merge with pops

# Add an index column to Q_tibble
k15run07$index <- seq_len(nrow(k15run07))

# Perform the merge as before
df1 <-
  merge(
    k15run07,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind    v1       v2       v3    v4    v5    v6    v7    v8    v9   v10
## 310 SOC 1065 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 311 SOC 1066 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 312 SOC 1067 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 313 SOC 1068 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 314 SOC 1069 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 315 SOC 1070 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
##       v11   v12   v13   v14      v15 Latitude Longitude Pop_City Country
## 310 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042  39.74533    Sochi  Russia
## 311 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042  39.74533    Sochi  Russia
## 312 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042  39.74533    Sochi  Russia
## 313 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042  39.74533    Sochi  Russia
## 314 2e-06 2e-06 2e-06 2e-06 0.004162 43.60042  39.74533    Sochi  Russia
## 315 2e-06 2e-06 2e-06 2e-06 0.000742 43.60042  39.74533    Sochi  Russia
##             Region   Subregion Year order
## 310 Eastern Europe East Europe 2021    38
## 311 Eastern Europe East Europe 2021    38
## 312 Eastern Europe East Europe 2021    38
## 313 Eastern Europe East Europe 2021    38
## 314 Eastern Europe East Europe 2021    38
## 315 Eastern Europe East Europe 2021    38

Q-values for k=15

make a palette with 15 colors

colors2 <-c(
      "v1" = "#F49AC2",
      "v2" = "purple",
      "v3" = "#FF8C1A",
      "v4" = "#77DD77",
      "v5" = "#1E90FF", 
      "v6" = "blue",
      "v7" = "green",
      "v8" = "yellow2",  
      "v9" = "#008080",
      "v10" = "#B22222",
      "v11" = "#75FAFF", 
      "v12" = "chocolate4",
      "v13" = "purple4",
      "v14" = "green4",
      "v15" = "#FFB347"
      )
colors2

##           v1           v2           v3           v4           v5           v6 
##    "#F49AC2"     "purple"    "#FF8C1A"    "#77DD77"    "#1E90FF"       "blue" 
##           v7           v8           v9          v10          v11          v12 
##      "green"    "yellow2"    "#008080"    "#B22222"    "#75FAFF" "chocolate4" 
##          v13          v14          v15 
##    "purple4"     "green4"    "#FFB347"

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k15_pie_all_countries_zoom.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = "black") +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k15_pie_all_countries_outlines.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k15_pie_all_countries_no_labs.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.2.2 k=5 map

Import the Q matrix (K5 for fastStructure) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k5run4 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.5.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k5run4 <- k5run4 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k5run4)

## # A tibble: 6 × 5
##         X1       X2       X3    X4       X5
##      <dbl>    <dbl>    <dbl> <dbl>    <dbl>
## 1 0.000005 0.000005 0.0292   0.971 0.000005
## 2 0.000005 0.000005 0.000005 1.00  0.000005
## 3 0.000005 0.000005 0.000005 1.00  0.000005
## 4 0.000005 0.104    0.00251  0.894 0.000005
## 5 0.000005 0.101    0.000005 0.871 0.0275  
## 6 0.000005 0.0316   0.000005 0.889 0.0796

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k5run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k5run4)

head(k5run4)

##    ind pop    X1       X2       X3       X4       X5
## 1 1065 SOC 5e-06 0.000005 0.029175 0.970810 0.000005
## 2 1066 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 3 1067 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 4 1068 SOC 5e-06 0.103530 0.002506 0.893954 0.000005
## 5 1069 SOC 5e-06 0.101074 0.000005 0.871372 0.027544
## 6 1070 SOC 5e-06 0.031613 0.000005 0.888727 0.079650

Rename the columns

# Rename the columns starting from the third one
k5run4 <- k5run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k5run4)

##    ind pop    v1       v2       v3       v4       v5
## 1 1065 SOC 5e-06 0.000005 0.029175 0.970810 0.000005
## 2 1066 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 3 1067 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 4 1068 SOC 5e-06 0.103530 0.002506 0.893954 0.000005
## 5 1069 SOC 5e-06 0.101074 0.000005 0.871372 0.027544
## 6 1070 SOC 5e-06 0.031613 0.000005 0.888727 0.079650

Merge with pops

# Add an index column to Q_tibble
k5run4$index <- seq_len(nrow(k5run4))

# Perform the merge as before
df1 <-
  merge(
    k5run4,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind    v1       v2       v3       v4       v5 Latitude Longitude
## 310 SOC 1065 5e-06 0.000005 0.029175 0.970810 0.000005 43.60042  39.74533
## 311 SOC 1066 5e-06 0.000005 0.000005 0.999980 0.000005 43.60042  39.74533
## 312 SOC 1067 5e-06 0.000005 0.000005 0.999980 0.000005 43.60042  39.74533
## 313 SOC 1068 5e-06 0.103530 0.002506 0.893954 0.000005 43.60042  39.74533
## 314 SOC 1069 5e-06 0.101074 0.000005 0.871372 0.027544 43.60042  39.74533
## 315 SOC 1070 5e-06 0.031613 0.000005 0.888727 0.079650 43.60042  39.74533
##     Pop_City Country         Region   Subregion Year order
## 310    Sochi  Russia Eastern Europe East Europe 2021    38
## 311    Sochi  Russia Eastern Europe East Europe 2021    38
## 312    Sochi  Russia Eastern Europe East Europe 2021    38
## 313    Sochi  Russia Eastern Europe East Europe 2021    38
## 314    Sochi  Russia Eastern Europe East Europe 2021    38
## 315    Sochi  Russia Eastern Europe East Europe 2021    38

Q-values for k=5

make a palette with 5 colors

colors2 <-c(
      "v1" = "#FFFF19",  
      "v2" = "#FF8C1A",
      "v3" = "#77DD37",
      "v4" = "purple3",   
      "v5" = "#1E90FF"
        )

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k5_pie_all_countries.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5"), color = "black") +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k5_pie_all_countries_outlines.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k5_pie_all_countries_no_labs.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.2.3 k=18 map

Import the Q matrix (K18 for fastStructure) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k18run2 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run002/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k18run2 <- k18run2 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k18run2)

## # A tibble: 6 × 18
##       X1    X2    X3    X4      X5    X6    X7      X8    X9   X10     X11   X12
##    <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1   1e-6  1e-6  1e-6  1e-6 2.77e-1  1e-6  1e-6 6.23e-1  1e-6  1e-6 1   e-6  1e-6
## 2   1e-6  1e-6  1e-6  1e-6 1.00e+0  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6
## 3   1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6
## 4   1e-6  1e-6  1e-6  1e-6 5.60e-1  1e-6  1e-6 4.33e-1  1e-6  1e-6 6.86e-3  1e-6
## 5   1e-6  1e-6  1e-6  1e-6 2.69e-1  1e-6  1e-6 7.31e-1  1e-6  1e-6 1   e-6  1e-6
## 6   1e-6  1e-6  1e-6  1e-6 4.14e-1  1e-6  1e-6 3.85e-1  1e-6  1e-6 1   e-6  1e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k18run2 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k18run2)

head(k18run2)

##    ind pop    X1    X2    X3    X4       X5    X6    X7       X8    X9   X10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
##        X11   X12   X13   X14   X15      X16      X17   X18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06

Rename the columns

# Rename the columns starting from the third one
k18run2 <- k18run2 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k18run2)

##    ind pop    v1    v2    v3    v4       v5    v6    v7       v8    v9   v10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
##        v11   v12   v13   v14   v15      v16      v17   v18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06

Merge with pops

# Add an index column to Q_tibble
k18run2$index <- seq_len(nrow(k18run2))

# Perform the merge as before
df1 <-
  merge(
    k18run2,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind    v1    v2    v3    v4       v5    v6    v7       v8    v9   v10
## 310 SOC 1065 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 311 SOC 1066 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 312 SOC 1067 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 313 SOC 1068 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 314 SOC 1069 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 315 SOC 1070 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
##          v11   v12   v13   v14   v15      v16      v17   v18 Latitude Longitude
## 310 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06 43.60042  39.74533
## 311 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042  39.74533
## 312 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06 43.60042  39.74533
## 313 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042  39.74533
## 314 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042  39.74533
## 315 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06 43.60042  39.74533
##     Pop_City Country         Region   Subregion Year order
## 310    Sochi  Russia Eastern Europe East Europe 2021    38
## 311    Sochi  Russia Eastern Europe East Europe 2021    38
## 312    Sochi  Russia Eastern Europe East Europe 2021    38
## 313    Sochi  Russia Eastern Europe East Europe 2021    38
## 314    Sochi  Russia Eastern Europe East Europe 2021    38
## 315    Sochi  Russia Eastern Europe East Europe 2021    38

Q-values for k=18

make a palette with 18 colors

colors2 <-c(
      "v1" = "#008080",
      "v2" = "#1E90FF",
      "v3" = "green4",
      "v4" = "#FFB347",
      "v5" = "#B20CD9", 
      "v6" = "orchid",
      "v7" = "blue",
      "v8" = "purple",  
      "v9" = "#75FAFF",
      "v10" = "#B22222",
      "v11" = "#F49AC2", 
      "v12" = "yellow2",
      "v13" = "green",
      "v14" = "#FF8C1A",
      "v15" = "chocolate4",
      "v16" = "purple4",
      "v17" = "#77DD77", 
      "v18" = "#FFFF99"
      )

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k18_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), color = "black") +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k18_pie_all_countries_outlined.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.2.4 k=14 map

Import the Q matrix (K14 for fastStructure) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k14run4 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.14.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k14run4 <- k14run4 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k14run4)

## # A tibble: 6 × 14
##         X1    X2    X3      X4    X5    X6    X7      X8    X9   X10   X11   X12
##      <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 6.09e-1 0.391  2e-6  2e-6  2e-6
## 2 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 2   e-6 1.00   2e-6  2e-6  2e-6
## 3 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 3.98e-1 0.602  2e-6  2e-6  2e-6
## 4 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 4.37e-1 0.563  2e-6  2e-6  2e-6
## 5 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 6.67e-1 0.333  2e-6  2e-6  2e-6
## 6 0.000002  2e-6  2e-6 1.02e-1  2e-6  2e-6  2e-6 4.70e-1 0.428  2e-6  2e-6  2e-6
## # ℹ 2 more variables: X13 <dbl>, X14 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k14run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k14run4)

head(k14run4)

##    ind pop    X1    X2    X3       X4    X5    X6    X7       X8       X9   X10
## 1 1065 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201 2e-06
## 2 1066 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977 2e-06
## 3 1067 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095 2e-06
## 4 1068 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699 2e-06
## 5 1069 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931 2e-06
## 6 1070 SOC 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289 2e-06
##     X11   X12   X13   X14
## 1 2e-06 2e-06 2e-06 2e-06
## 2 2e-06 2e-06 2e-06 2e-06
## 3 2e-06 2e-06 2e-06 2e-06
## 4 2e-06 2e-06 2e-06 2e-06
## 5 2e-06 2e-06 2e-06 2e-06
## 6 2e-06 2e-06 2e-06 2e-06

Rename the columns

# Rename the columns starting from the third one
k14run4 <- k14run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k14run4)

##    ind pop    v1    v2    v3       v4    v5    v6    v7       v8       v9   v10
## 1 1065 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201 2e-06
## 2 1066 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977 2e-06
## 3 1067 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095 2e-06
## 4 1068 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699 2e-06
## 5 1069 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931 2e-06
## 6 1070 SOC 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289 2e-06
##     v11   v12   v13   v14
## 1 2e-06 2e-06 2e-06 2e-06
## 2 2e-06 2e-06 2e-06 2e-06
## 3 2e-06 2e-06 2e-06 2e-06
## 4 2e-06 2e-06 2e-06 2e-06
## 5 2e-06 2e-06 2e-06 2e-06
## 6 2e-06 2e-06 2e-06 2e-06

Merge with pops

# Add an index column to Q_tibble
k14run4$index <- seq_len(nrow(k14run4))

# Perform the merge as before
df1 <-
  merge(
    k14run4,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind    v1    v2    v3       v4    v5    v6    v7       v8       v9
## 310 SOC 1065 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201
## 311 SOC 1066 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977
## 312 SOC 1067 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095
## 313 SOC 1068 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699
## 314 SOC 1069 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931
## 315 SOC 1070 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289
##       v10   v11   v12   v13   v14 Latitude Longitude Pop_City Country
## 310 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 311 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 312 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 313 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 314 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 315 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
##             Region   Subregion Year order
## 310 Eastern Europe East Europe 2021    38
## 311 Eastern Europe East Europe 2021    38
## 312 Eastern Europe East Europe 2021    38
## 313 Eastern Europe East Europe 2021    38
## 314 Eastern Europe East Europe 2021    38
## 315 Eastern Europe East Europe 2021    38

Q-values for k=14

make a palette with 14 colors

colors2 <-c(
      "v1" = "#F49AC2",
      "v2" = "#77DD77",
      "v3" = "blue",
      "v4" = "#FF8C1A",
      "v5" = "#B20CC9", 
      "v6" = "green4",
      "v7" = "#B22222",
      "v8" = "purple4",  
      "v9" = "purple",
      "v10" = "chocolate4",
      "v11" = "#1E90FF", 
      "v12" = "#008080",
      "v13" = "green",
      "v14" = "yellow2"
)

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k14_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = "black") +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k14_pie_all_countries_outlined.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.2.5 k=13 map

Import the Q matrix (K13 for fastStructure) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k13run8 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run008/simple.13.meanQ"
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k13run8 <- k13run8 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k13run8)

## # A tibble: 6 × 13
##         X1    X2      X3      X4    X5      X6    X7    X8      X9     X10   X11
##      <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>
## 1 0.546    0.443    2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 1.17e-2 2   e-6  2e-6
## 2 0.000002 1.00     2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 2   e-6 2   e-6  2e-6
## 3 0.000002 1.00     2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 2   e-6 2   e-6  2e-6
## 4 0.000002 0.843    2e-6 2   e-6  2e-6 1.57e-1  2e-6  2e-6 2   e-6 2   e-6  2e-6
## 5 0.483    0.475    2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 4.20e-2 2   e-6  2e-6
## 6 0.313    0.577    2e-6 2.64e-2  2e-6 2   e-6  2e-6  2e-6 2   e-6 8.40e-2  2e-6
## # ℹ 2 more variables: X12 <dbl>, X13 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k13run8 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k13run8)

head(k13run8)

##    ind pop       X1       X2    X3       X4    X5       X6    X7    X8       X9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
##        X10   X11   X12   X13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06

Rename the columns

# Rename the columns starting from the third one
k13run8 <- k13run8 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k13run8)

##    ind pop       v1       v2    v3       v4    v5       v6    v7    v8       v9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
##        v10   v11   v12   v13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06

Merge with pops

# Add an index column to Q_tibble
k13run8$index <- seq_len(nrow(k13run8))

# Perform the merge as before
df1 <-
  merge(
    k13run8,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1       v2    v3       v4    v5       v6    v7    v8
## 310 SOC 1065 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 311 SOC 1066 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 312 SOC 1067 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 313 SOC 1068 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06
## 314 SOC 1069 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 315 SOC 1070 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06
##           v9      v10   v11   v12   v13 Latitude Longitude Pop_City Country
## 310 0.011724 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 311 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 312 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 313 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 314 0.042005 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 315 0.000002 0.084006 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
##             Region   Subregion Year order
## 310 Eastern Europe East Europe 2021    38
## 311 Eastern Europe East Europe 2021    38
## 312 Eastern Europe East Europe 2021    38
## 313 Eastern Europe East Europe 2021    38
## 314 Eastern Europe East Europe 2021    38
## 315 Eastern Europe East Europe 2021    38

Q-values for k=13

make a palette with 13 colors

colors2 <-c(
      "v1" = "purple4",
      "v2" = "purple",
      "v3" = "blue",
      "v4" = "#FF8C1A",
      "v5" = "chocolate4", 
      "v6" = "#B20CD9",
      "v7" = "#B22222",
      "v8" = "#1E90FF",  
      "v9" = "green",
      "v10" = "yellow2",
      "v11" = "#77DD77",  
      "v12" = "#008080",
      "v13" = "#F49AC2"
      )

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k13_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), color = "black") +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

ggsave(
  here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k13_pie_all_countries_outlined.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3 Maps by subregion for MAF 1% (r2<0.01)

####5.3.1 k=15 Import the Q matrix (K15 for fastStructure) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run07 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run007/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k15run07 <- k15run07 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k15run07)

## # A tibble: 6 × 15
##         X1    X2       X3     X4    X5    X6    X7    X8    X9   X10   X11   X12
##      <dbl> <dbl>    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 1.00  0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 2 0.000002 1.00  0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 3 0.000002 1.00  0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 4 0.000002 0.927 0.0732     2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 5 0.000002 0.996 0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## 6 0.000002 0.999 0.000002   2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6
## # ℹ 3 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run07 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run07)

head(k15run07)

##    ind pop    X1       X2       X3    X4    X5    X6    X7    X8    X9   X10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
##     X11   X12   X13   X14      X15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742

Rename the columns

# Rename the columns starting from the third one
k15run07 <- k15run07 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run07)

##    ind pop    v1       v2       v3    v4    v5    v6    v7    v8    v9   v10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
##     v11   v12   v13   v14      v15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742

Merge with pops

# Add an index column to Q_tibble
k15run07$index <- seq_len(nrow(k15run07))

# Perform the merge as before
df1 <-
  merge(
    k15run07,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind    v1       v2       v3    v4    v5    v6    v7    v8    v9   v10
## 310 SOC 1065 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 311 SOC 1066 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 312 SOC 1067 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 313 SOC 1068 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 314 SOC 1069 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 315 SOC 1070 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
##       v11   v12   v13   v14      v15 Latitude Longitude Pop_City Country
## 310 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042  39.74533    Sochi  Russia
## 311 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042  39.74533    Sochi  Russia
## 312 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042  39.74533    Sochi  Russia
## 313 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042  39.74533    Sochi  Russia
## 314 2e-06 2e-06 2e-06 2e-06 0.004162 43.60042  39.74533    Sochi  Russia
## 315 2e-06 2e-06 2e-06 2e-06 0.000742 43.60042  39.74533    Sochi  Russia
##             Region   Subregion Year order
## 310 Eastern Europe East Europe 2021    38
## 311 Eastern Europe East Europe 2021    38
## 312 Eastern Europe East Europe 2021    38
## 313 Eastern Europe East Europe 2021    38
## 314 Eastern Europe East Europe 2021    38
## 315 Eastern Europe East Europe 2021    38

Q-values for k=15

make a palette with 15 colors

colors2 <-c(
      "v1" = "#F49AC2",
      "v2" = "purple",
      "v3" = "#FF8C1A",
      "v4" = "#77DD77",
      "v5" = "#1E90FF", 
      "v6" = "blue",
      "v7" = "green",
      "v8" = "yellow2",  
      "v9" = "#008080",
      "v10" = "#B22222",
      "v11" = "#75FAFF", 
      "v12" = "chocolate4",
      "v13" = "purple4",
      "v14" = "green4",
      "v15" = "#FFB347"
      )
colors2

##           v1           v2           v3           v4           v5           v6 
##    "#F49AC2"     "purple"    "#FF8C1A"    "#77DD77"    "#1E90FF"       "blue" 
##           v7           v8           v9          v10          v11          v12 
##      "green"    "yellow2"    "#008080"    "#B22222"    "#75FAFF" "chocolate4" 
##          v13          v14          v15 
##    "purple4"     "green4"    "#FFB347"

5.3.1.1 zoom in on Italy

world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
  filter(admin=="Italy")

# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |> 
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries2, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_italy.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
  filter(admin=="Italy")

# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |> 
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries2, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_italy_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.1.2 zoom in on Eastern Europe

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
#countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
#countries_with_data2 <- unique(df1$Country)

selected_countries3 <- world 
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
 # filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_fareast.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")

selected_countries3 <- world 

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_fareast_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.1.3 zoom in on Iberian peninsula

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
  filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_iberia.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
  filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_iberia_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.1.4 zoom in on Balkans

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")


selected_countries3 <- world 


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_balkans_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")


selected_countries3 <- world 


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_balkans.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.2 k=14

Import the Q matrix (K14 for fastStructure) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k14run4 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.14.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k14run4 <- k14run4 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k14run4)

## # A tibble: 6 × 14
##         X1    X2    X3      X4    X5    X6    X7      X8    X9   X10   X11   X12
##      <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 6.09e-1 0.391  2e-6  2e-6  2e-6
## 2 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 2   e-6 1.00   2e-6  2e-6  2e-6
## 3 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 3.98e-1 0.602  2e-6  2e-6  2e-6
## 4 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 4.37e-1 0.563  2e-6  2e-6  2e-6
## 5 0.000002  2e-6  2e-6 2   e-6  2e-6  2e-6  2e-6 6.67e-1 0.333  2e-6  2e-6  2e-6
## 6 0.000002  2e-6  2e-6 1.02e-1  2e-6  2e-6  2e-6 4.70e-1 0.428  2e-6  2e-6  2e-6
## # ℹ 2 more variables: X13 <dbl>, X14 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k14run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k14run4)

head(k14run4)

##    ind pop    X1    X2    X3       X4    X5    X6    X7       X8       X9   X10
## 1 1065 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201 2e-06
## 2 1066 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977 2e-06
## 3 1067 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095 2e-06
## 4 1068 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699 2e-06
## 5 1069 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931 2e-06
## 6 1070 SOC 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289 2e-06
##     X11   X12   X13   X14
## 1 2e-06 2e-06 2e-06 2e-06
## 2 2e-06 2e-06 2e-06 2e-06
## 3 2e-06 2e-06 2e-06 2e-06
## 4 2e-06 2e-06 2e-06 2e-06
## 5 2e-06 2e-06 2e-06 2e-06
## 6 2e-06 2e-06 2e-06 2e-06

Rename the columns

# Rename the columns starting from the third one
k14run4 <- k14run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k14run4)

##    ind pop    v1    v2    v3       v4    v5    v6    v7       v8       v9   v10
## 1 1065 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201 2e-06
## 2 1066 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977 2e-06
## 3 1067 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095 2e-06
## 4 1068 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699 2e-06
## 5 1069 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931 2e-06
## 6 1070 SOC 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289 2e-06
##     v11   v12   v13   v14
## 1 2e-06 2e-06 2e-06 2e-06
## 2 2e-06 2e-06 2e-06 2e-06
## 3 2e-06 2e-06 2e-06 2e-06
## 4 2e-06 2e-06 2e-06 2e-06
## 5 2e-06 2e-06 2e-06 2e-06
## 6 2e-06 2e-06 2e-06 2e-06

Merge with pops

# Add an index column to Q_tibble
k14run4$index <- seq_len(nrow(k14run4))

# Perform the merge as before
df1 <-
  merge(
    k14run4,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind    v1    v2    v3       v4    v5    v6    v7       v8       v9
## 310 SOC 1065 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201
## 311 SOC 1066 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977
## 312 SOC 1067 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095
## 313 SOC 1068 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699
## 314 SOC 1069 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931
## 315 SOC 1070 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289
##       v10   v11   v12   v13   v14 Latitude Longitude Pop_City Country
## 310 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 311 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 312 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 313 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 314 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 315 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
##             Region   Subregion Year order
## 310 Eastern Europe East Europe 2021    38
## 311 Eastern Europe East Europe 2021    38
## 312 Eastern Europe East Europe 2021    38
## 313 Eastern Europe East Europe 2021    38
## 314 Eastern Europe East Europe 2021    38
## 315 Eastern Europe East Europe 2021    38

Q-values for k=14

make a palette with 14 colors

colors2 <-c(
      "v1" = "#F49AC2",
      "v2" = "#77DD77",
      "v3" = "blue",
      "v4" = "#FF8C1A",
      "v5" = "#B20CC9", 
      "v6" = "green4",
      "v7" = "#B22222",
      "v8" = "purple4",  
      "v9" = "purple",
      "v10" = "chocolate4",
      "v11" = "#1E90FF", 
      "v12" = "#008080",
      "v13" = "green",
      "v14" = "yellow2"
)

5.3.2.1 zoom in on Italy

world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
  filter(admin=="Italy")

# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |> 
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries2, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_italy.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
  filter(admin=="Italy")

# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |> 
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries2, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_italy_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.2.2 zoom in on Eastern Europe

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
#countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
#countries_with_data2 <- unique(df1$Country)

selected_countries3 <- world 
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
 # filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_fareast.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")

selected_countries3 <- world 

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_fareast_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.2.3 zoom in on Iberian peninsula

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
  filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_iberia.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
  filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_iberia_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.2.4 zoom in on Balkans

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")


selected_countries3 <- world 


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_balkans_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")


selected_countries3 <- world 


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_balkans.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.3 k=13

Import the Q matrix (K13 for fastStructure) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k13run8 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run008/simple.13.meanQ"
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k13run8 <- k13run8 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k13run8)

## # A tibble: 6 × 13
##         X1    X2      X3      X4    X5      X6    X7    X8      X9     X10   X11
##      <dbl> <dbl>   <dbl>   <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>
## 1 0.546    0.443    2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 1.17e-2 2   e-6  2e-6
## 2 0.000002 1.00     2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 2   e-6 2   e-6  2e-6
## 3 0.000002 1.00     2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 2   e-6 2   e-6  2e-6
## 4 0.000002 0.843    2e-6 2   e-6  2e-6 1.57e-1  2e-6  2e-6 2   e-6 2   e-6  2e-6
## 5 0.483    0.475    2e-6 2   e-6  2e-6 2   e-6  2e-6  2e-6 4.20e-2 2   e-6  2e-6
## 6 0.313    0.577    2e-6 2.64e-2  2e-6 2   e-6  2e-6  2e-6 2   e-6 8.40e-2  2e-6
## # ℹ 2 more variables: X12 <dbl>, X13 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k13run8 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k13run8)

head(k13run8)

##    ind pop       X1       X2    X3       X4    X5       X6    X7    X8       X9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
##        X10   X11   X12   X13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06

Rename the columns

# Rename the columns starting from the third one
k13run8 <- k13run8 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k13run8)

##    ind pop       v1       v2    v3       v4    v5       v6    v7    v8       v9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
##        v10   v11   v12   v13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06

Merge with pops

# Add an index column to Q_tibble
k13run8$index <- seq_len(nrow(k13run8))

# Perform the merge as before
df1 <-
  merge(
    k13run8,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1       v2    v3       v4    v5       v6    v7    v8
## 310 SOC 1065 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 311 SOC 1066 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 312 SOC 1067 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 313 SOC 1068 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06
## 314 SOC 1069 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 315 SOC 1070 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06
##           v9      v10   v11   v12   v13 Latitude Longitude Pop_City Country
## 310 0.011724 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 311 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 312 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 313 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 314 0.042005 0.000002 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
## 315 0.000002 0.084006 2e-06 2e-06 2e-06 43.60042  39.74533    Sochi  Russia
##             Region   Subregion Year order
## 310 Eastern Europe East Europe 2021    38
## 311 Eastern Europe East Europe 2021    38
## 312 Eastern Europe East Europe 2021    38
## 313 Eastern Europe East Europe 2021    38
## 314 Eastern Europe East Europe 2021    38
## 315 Eastern Europe East Europe 2021    38

Q-values for k=13

make a palette with 13 colors

colors2 <-c(
      "v1" = "purple4",
      "v2" = "purple",
      "v3" = "blue",
      "v4" = "#FF8C1A",
      "v5" = "chocolate4", 
      "v6" = "#B20CD9",
      "v7" = "#B22222",
      "v8" = "#1E90FF",  
      "v9" = "green",
      "v10" = "yellow2",
      "v11" = "#77DD77",  
      "v12" = "#008080",
      "v13" = "#F49AC2"
      )

5.3.3.1 zoom in on Italy

world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
  filter(admin=="Italy")

# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |> 
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries2, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_italy.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
  filter(admin=="Italy")

# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |> 
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries2, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_italy_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.3.2 zoom in on Eastern Europe

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
#countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
#countries_with_data2 <- unique(df1$Country)

selected_countries3 <- world 
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
 # filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_fareast.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")

selected_countries3 <- world 

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_fareast_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.3.3 zoom in on Iberian peninsula

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
  filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_iberia.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
  filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_iberia_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.3.4 zoom in on Balkans

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")


selected_countries3 <- world 


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_balkans_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")


selected_countries3 <- world 


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_balkans.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.4 k=18

Import the Q matrix (K18 for fastStructure)

# Extract ancestry coefficients
k18run2 <- read_delim(
  here(
    "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run002/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
  ),
  delim = "  ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k18run2 <- k18run2 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k18run2)

## # A tibble: 6 × 18
##       X1    X2    X3    X4      X5    X6    X7      X8    X9   X10     X11   X12
##    <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1   1e-6  1e-6  1e-6  1e-6 2.77e-1  1e-6  1e-6 6.23e-1  1e-6  1e-6 1   e-6  1e-6
## 2   1e-6  1e-6  1e-6  1e-6 1.00e+0  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6
## 3   1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6  1e-6 1   e-6  1e-6
## 4   1e-6  1e-6  1e-6  1e-6 5.60e-1  1e-6  1e-6 4.33e-1  1e-6  1e-6 6.86e-3  1e-6
## 5   1e-6  1e-6  1e-6  1e-6 2.69e-1  1e-6  1e-6 7.31e-1  1e-6  1e-6 1   e-6  1e-6
## 6   1e-6  1e-6  1e-6  1e-6 4.14e-1  1e-6  1e-6 3.85e-1  1e-6  1e-6 1   e-6  1e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k18run2 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k18run2)

head(k18run2)

##    ind pop    X1    X2    X3    X4       X5    X6    X7       X8    X9   X10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
##        X11   X12   X13   X14   X15      X16      X17   X18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06

Rename the columns

# Rename the columns starting from the third one
k18run2 <- k18run2 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k18run2)

##    ind pop    v1    v2    v3    v4       v5    v6    v7       v8    v9   v10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
##        v11   v12   v13   v14   v15      v16      v17   v18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06

Merge with pops

# Add an index column to Q_tibble
k18run2$index <- seq_len(nrow(k18run2))

# Perform the merge as before
df1 <-
  merge(
    k18run2,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind    v1    v2    v3    v4       v5    v6    v7       v8    v9   v10
## 310 SOC 1065 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 311 SOC 1066 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 312 SOC 1067 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 313 SOC 1068 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 314 SOC 1069 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 315 SOC 1070 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
##          v11   v12   v13   v14   v15      v16      v17   v18 Latitude Longitude
## 310 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06 43.60042  39.74533
## 311 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042  39.74533
## 312 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06 43.60042  39.74533
## 313 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042  39.74533
## 314 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042  39.74533
## 315 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06 43.60042  39.74533
##     Pop_City Country         Region   Subregion Year order
## 310    Sochi  Russia Eastern Europe East Europe 2021    38
## 311    Sochi  Russia Eastern Europe East Europe 2021    38
## 312    Sochi  Russia Eastern Europe East Europe 2021    38
## 313    Sochi  Russia Eastern Europe East Europe 2021    38
## 314    Sochi  Russia Eastern Europe East Europe 2021    38
## 315    Sochi  Russia Eastern Europe East Europe 2021    38

Q-values for k=18

make a palette with 18 colors

colors2 <-c(
      "v1" = "#008080",
      "v2" = "#1E90FF",
      "v3" = "green4",
      "v4" = "#FFB347",
      "v5" = "#B20CD9", 
      "v6" = "orchid",
      "v7" = "blue",
      "v8" = "purple",  
      "v9" = "#75FAFF",
      "v10" = "#B22222",
      "v11" = "#F49AC2", 
      "v12" = "yellow2",
      "v13" = "green",
      "v14" = "#FF8C1A",
      "v15" = "chocolate4",
      "v16" = "purple4",
      "v17" = "#77DD77", 
      "v18" = "#FFFF99"
      )

5.3.4.1 zoom in on Italy

world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
  filter(admin=="Italy")

# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |> 
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries2, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_italy.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
  filter(admin=="Italy")

# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |> 
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries2, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_italy_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.4.2 zoom in on Eastern Europe

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
#countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
#countries_with_data2 <- unique(df1$Country)

selected_countries3 <- world 
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
 # filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), 
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_fareast.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")

selected_countries3 <- world 

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), 
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_fareast_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.4.3 zoom in on Iberian peninsula

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
  filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), 
               
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_iberia.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)

# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
  filter(admin %in% countries_with_data2)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), 
                color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_iberia_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

5.3.3.4 zoom in on Balkans

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")


selected_countries3 <- world 


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                 cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
                  color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_balkans_labels.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")


selected_countries3 <- world 


# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries3, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 0.7), 
                cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
                  color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
  my_theme()

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_balkans.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Admixture

6. Set 2: r2<0.1 dataset from admixture

6.1 Admixture r2<0.1 structure plots

6.1.1 k=15

Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run4 <- read_delim(
  here("output", "europe", "admixture", "r2_0.1.15.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k15run4)

## # A tibble: 6 × 15
##        X1      X2      X3      X4       X5      X6      X7      X8      X9
##     <dbl>   <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.361   0.0551  0.00001 0.00001 0.00342  0.00001 0.00001 0.00001 0.00001
## 2 0.892   0.00001 0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001
## 3 0.00556 0.00001 0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001
## 4 0.575   0.0486  0.00730 0.00001 0.00001  0.00001 0.00001 0.00001 0.00577
## 5 0.390   0.0630  0.00001 0.0300  0.000727 0.00793 0.00452 0.00001 0.00001
## 6 0.429   0.0622  0.00001 0.00001 0.0177   0.00001 0.00001 0.0188  0.0319 
## # ℹ 6 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## #   X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run4)

head(k15run4)

##    ind pop       X1       X2       X3       X4       X5       X6       X7
## 1 1065 SOC 0.361376 0.055147 0.000010 0.000010 0.003424 0.000010 0.000010
## 2 1066 SOC 0.892415 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 3 1067 SOC 0.005563 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.574814 0.048574 0.007302 0.000010 0.000010 0.000010 0.000010
## 5 1069 SOC 0.390191 0.063016 0.000010 0.029971 0.000727 0.007935 0.004521
## 6 1070 SOC 0.429234 0.062239 0.000010 0.000010 0.017672 0.000010 0.000010
##         X8       X9      X10      X11      X12     X13      X14      X15
## 1 0.000010 0.000010 0.356106 0.000010 0.223847 0.00001 0.000010 0.000010
## 2 0.000010 0.000010 0.000010 0.000010 0.107455 0.00001 0.000010 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.994307 0.00001 0.000010 0.000010
## 4 0.000010 0.005773 0.312298 0.000849 0.023455 0.00305 0.004865 0.018971
## 5 0.000010 0.000010 0.482465 0.000010 0.019218 0.00001 0.000010 0.001896
## 6 0.018843 0.031885 0.259303 0.000010 0.175995 0.00001 0.004243 0.000528

Rename the columns

# Rename the columns starting from the third one
k15run4 <- k15run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run4)

##    ind pop       v1       v2       v3       v4       v5       v6       v7
## 1 1065 SOC 0.361376 0.055147 0.000010 0.000010 0.003424 0.000010 0.000010
## 2 1066 SOC 0.892415 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 3 1067 SOC 0.005563 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.574814 0.048574 0.007302 0.000010 0.000010 0.000010 0.000010
## 5 1069 SOC 0.390191 0.063016 0.000010 0.029971 0.000727 0.007935 0.004521
## 6 1070 SOC 0.429234 0.062239 0.000010 0.000010 0.017672 0.000010 0.000010
##         v8       v9      v10      v11      v12     v13      v14      v15
## 1 0.000010 0.000010 0.356106 0.000010 0.223847 0.00001 0.000010 0.000010
## 2 0.000010 0.000010 0.000010 0.000010 0.107455 0.00001 0.000010 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.994307 0.00001 0.000010 0.000010
## 4 0.000010 0.005773 0.312298 0.000849 0.023455 0.00305 0.004865 0.018971
## 5 0.000010 0.000010 0.482465 0.000010 0.019218 0.00001 0.000010 0.001896
## 6 0.018843 0.031885 0.259303 0.000010 0.175995 0.00001 0.004243 0.000528

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k15run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
  c(
    "blue", 
    "#77DD77",
    "#F49AC2",      
    "green4",    
    "yellow2",
    "#FFFF99",      
    "purple4",  
    "chocolate4",
    "#FF8C1A",   
    "#008080",  
    "#B20CD9", 
    "green",        
    "#1E90FF",
    "purple", 
    "#B22222"    
    )


 # Generate all potential variable names
all_variables <- paste0("v", 1:15)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

 # Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n Admixture for k1:40 with 47,484 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "admixture_europe_k15_r1_47k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6.1.2 k=5

Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k5run4 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/run4/r2_0.1.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k5run4)

## # A tibble: 6 × 5
##         X1       X2      X3    X4      X5
##      <dbl>    <dbl>   <dbl> <dbl>   <dbl>
## 1 0.0101   0.000045 0.0314  0.946 0.0127 
## 2 0.000012 0.00001  0.00001 1.00  0.00001
## 3 0.00001  0.00001  0.00002 1.00  0.00001
## 4 0.00947  0.0335   0.0581  0.899 0.00001
## 5 0.0220   0.00637  0.0356  0.889 0.0471 
## 6 0.0396   0.00998  0.0868  0.864 0.00001

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k5run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k5run4)

Rename the columns

# Rename the columns starting from the third one
k5run4 <- k5run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k5run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
c(   "#1E90FF",
     "red",
     "purple3", 
     "#FFFF19",      
     "#FF8C1A")
  

 # Generate all potential variable names
all_variables <- paste0("v", 1:5)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:40 with 47,484 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "admixture_europe_k5_r1_47k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

6.1.3 k=6

Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k6run3 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/run3/r2_0.1.6.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k6run3)

## # A tibble: 6 × 6
##        X1       X2    X3      X4      X5      X6
##     <dbl>    <dbl> <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.0245  0.00001  0.940 0.0120  0.0238  0.00001
## 2 0.00001 0.00001  1.00  0.00001 0.00001 0.00001
## 3 0.00001 0.00001  1.00  0.00001 0.00001 0.00001
## 4 0.0481  0.00001  0.893 0.00001 0.0466  0.0118 
## 5 0.0243  0.000011 0.882 0.0473  0.0466  0.00001
## 6 0.0809  0.0226   0.860 0.00001 0.0365  0.00001

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k6run3 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k6run3)

Rename the columns

# Rename the columns starting from the third one
k6run3 <- k6run3 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k6run3 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
c(
    "#FFFF19",   
    "#77DD37",    
    "#FF8C1A", 
    "red",
    "purple3",
    "#1E90FF"    
    )

 # Generate all potential variable names
all_variables <- paste0("v", 1:6)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n Admixture for k1:40 with 47,484 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "admixture_europe_k6_r1_47k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

7. Set 1: r2<0.01 dataset from admixture

7.1 Admixture r2<0.01 structure plots

7.1.1 k=15

Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run4 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run4/r2_0.01.15.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k15run4)

## # A tibble: 6 × 15
##         X1      X2      X3      X4       X5      X6      X7      X8      X9
##      <dbl>   <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.00001  0.00001 0.00001 0.0802  0.00001  0.00001 0.511   0.00001 0.00001
## 2 0.00001  0.00001 0.00001 0.00001 0.000011 0.00001 0.00001 0.00001 0.00001
## 3 0.00001  0.00001 0.802   0.00001 0.00001  0.00001 0.00001 0.00001 0.00001
## 4 0.000302 0.00001 0.00001 0.0385  0.0181   0.00721 0.310   0.0385  0.0240 
## 5 0.00001  0.0311  0.0327  0.0640  0.00001  0.00001 0.452   0.00855 0.00728
## 6 0.0141   0.00001 0.223   0.0812  0.00001  0.00663 0.232   0.00001 0.00001
## # ℹ 6 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## #   X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run4)

head(k15run4)

##    ind pop       X1       X2       X3       X4       X5       X6       X7
## 1 1065 SOC 0.000010 0.000010 0.000010 0.080208 0.000010 0.000010 0.511214
## 2 1066 SOC 0.000010 0.000010 0.000010 0.000010 0.000011 0.000010 0.000010
## 3 1067 SOC 0.000010 0.000010 0.801950 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.000302 0.000010 0.000010 0.038536 0.018128 0.007211 0.310458
## 5 1069 SOC 0.000010 0.031117 0.032690 0.064019 0.000010 0.000010 0.452264
## 6 1070 SOC 0.014112 0.000010 0.223026 0.081151 0.000010 0.006632 0.231936
##         X8       X9      X10      X11   X12      X13     X14      X15
## 1 0.000010 0.000010 0.000010 0.369351 1e-05 0.039116 0.00001 0.000010
## 2 0.000010 0.000010 0.000010 0.279744 1e-05 0.720126 0.00001 0.000010
## 3 0.000010 0.000010 0.000010 0.073619 1e-05 0.124311 0.00001 0.000010
## 4 0.038546 0.023974 0.000010 0.200266 1e-05 0.360237 0.00001 0.002292
## 5 0.008548 0.007282 0.046382 0.078845 1e-05 0.274773 0.00001 0.004030
## 6 0.000010 0.000010 0.031754 0.017665 1e-05 0.354584 0.03908 0.000010

Rename the columns

# Rename the columns starting from the third one
k15run4 <- k15run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run4)

##    ind pop       v1       v2       v3       v4       v5       v6       v7
## 1 1065 SOC 0.000010 0.000010 0.000010 0.080208 0.000010 0.000010 0.511214
## 2 1066 SOC 0.000010 0.000010 0.000010 0.000010 0.000011 0.000010 0.000010
## 3 1067 SOC 0.000010 0.000010 0.801950 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.000302 0.000010 0.000010 0.038536 0.018128 0.007211 0.310458
## 5 1069 SOC 0.000010 0.031117 0.032690 0.064019 0.000010 0.000010 0.452264
## 6 1070 SOC 0.014112 0.000010 0.223026 0.081151 0.000010 0.006632 0.231936
##         v8       v9      v10      v11   v12      v13     v14      v15
## 1 0.000010 0.000010 0.000010 0.369351 1e-05 0.039116 0.00001 0.000010
## 2 0.000010 0.000010 0.000010 0.279744 1e-05 0.720126 0.00001 0.000010
## 3 0.000010 0.000010 0.000010 0.073619 1e-05 0.124311 0.00001 0.000010
## 4 0.038546 0.023974 0.000010 0.200266 1e-05 0.360237 0.00001 0.002292
## 5 0.008548 0.007282 0.046382 0.078845 1e-05 0.274773 0.00001 0.004030
## 6 0.000010 0.000010 0.031754 0.017665 1e-05 0.354584 0.03908 0.000010

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k15run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "#FF8C1A",       
    "purple4",       
    "#1E90FF",      
    "orchid",       
    "blue", 
    "purple",     
    "chocolate4",   
    "#77DD77",    
    "#B22222",
    "#F49AC2",      
    "green",      
    "yellow2",
    "#008080",      
    "#B20CD9",  
    "green4"
    )
  


 # Generate all potential variable names
all_variables <- paste0("v", 1:15)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

 # Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n Admixture for k1:40 with 17,028 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "admixture_europe_k15_r01_17k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

7.1.2 k=5

Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k5run4 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run4/r2_0.01.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k5run4)

## # A tibble: 6 × 5
##         X1      X2      X3    X4      X5
##      <dbl>   <dbl>   <dbl> <dbl>   <dbl>
## 1 0.00001  0.0449  0.0554  0.872 0.0274 
## 2 0.000829 0.0217  0.00001 0.977 0.00001
## 3 0.00001  0.00001 0.00001 1.00  0.00001
## 4 0.0159   0.0888  0.0678  0.827 0.00001
## 5 0.0171   0.0779  0.0479  0.810 0.0469 
## 6 0.0309   0.0210  0.127   0.821 0.00001

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k5run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k5run4)

Rename the columns

# Rename the columns starting from the third one
k5run4 <- k5run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k5run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
c(
     "#FFFF19",   
     "red",
     "purple3",
     "#1E90FF",
     "#FF8C1A")


 # Generate all potential variable names
all_variables <- paste0("v", 1:5)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:40 with 17,028 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "admixture_europe_k5_r01_17k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

7.1.3 k=6

Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k6run1 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run1/r2_0.01.6.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k6run1)

## # A tibble: 6 × 6
##        X1    X2       X3      X4      X5      X6
##     <dbl> <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.0190  0.867 0.00001  0.0224  0.0407  0.0511 
## 2 0.00001 0.973 0.000012 0.00001 0.0274  0.00001
## 3 0.00001 1.00  0.00001  0.00001 0.00001 0.00001
## 4 0.0225  0.820 0.00807  0.00001 0.0990  0.0508 
## 5 0.00997 0.802 0.00693  0.0542  0.104   0.0231 
## 6 0.00001 0.811 0.0240   0.00001 0.0530  0.112

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k6run1 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k6run1)

Rename the columns

# Rename the columns starting from the third one
k6run1 <- k6run1 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k6run1 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
c(
    "#1E90FF",
    "purple3",    
    "#FFFF19",   
    "#77DD37",  
    "#FF8C1A", 
    "red"    
    )
 
 # Generate all potential variable names
all_variables <- paste0("v", 1:6)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n Admixture for k1:40 with 17,028 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "admixture_europe_k6_r01_17k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

7.1.4 k=13

Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k13run4 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run4/r2_0.01.13.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k13run4)

## # A tibble: 6 × 13
##         X1      X2      X3      X4      X5     X6      X7      X8     X9     X10
##      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>  <dbl>   <dbl>
## 1 0.00001  0.394   9.30e-3 0.0164  0.0179  0.273  0.00001 1.5 e-5 0.222  0.00001
## 2 0.00001  0.0192  1   e-5 0.00001 0.00001 0.827  0.00001 1   e-5 0.118  0.00001
## 3 0.00001  0.00001 1   e-5 0.00001 0.00001 0.0938 0.00001 1   e-5 0.906  0.00001
## 4 0.000074 0.329   6.18e-3 0.00233 0.0441  0.497  0.0180  1   e-5 0.0280 0.00698
## 5 0.00001  0.465   7.41e-4 0.00001 0.0112  0.345  0.00001 3.21e-2 0.0283 0.0431 
## 6 0.0164   0.242   1   e-5 0.0391  0.00001 0.381  0.00716 1   e-5 0.199  0.0238 
## # ℹ 3 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k13run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k13run4)

head(k13run4)

##    ind pop       X1       X2       X3       X4       X5       X6       X7
## 1 1065 SOC 0.000010 0.394353 0.009296 0.016374 0.017866 0.273391 0.000010
## 2 1066 SOC 0.000010 0.019216 0.000010 0.000010 0.000010 0.826863 0.000010
## 3 1067 SOC 0.000010 0.000010 0.000010 0.000010 0.000010 0.093768 0.000010
## 4 1068 SOC 0.000074 0.329223 0.006175 0.002333 0.044095 0.496514 0.017950
## 5 1069 SOC 0.000010 0.464799 0.000741 0.000010 0.011165 0.345466 0.000010
## 6 1070 SOC 0.016362 0.242028 0.000010 0.039087 0.000010 0.381455 0.007155
##         X8       X9      X10      X11      X12      X13
## 1 0.000015 0.221642 0.000010 0.000010 0.067012 0.000010
## 2 0.000010 0.117549 0.000010 0.036281 0.000010 0.000010
## 3 0.000010 0.906122 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.027967 0.006979 0.025743 0.038663 0.004274
## 5 0.032062 0.028323 0.043070 0.002007 0.066806 0.005531
## 6 0.000010 0.198886 0.023795 0.000010 0.091182 0.000010

Rename the columns

# Rename the columns starting from the third one
k13run4 <- k13run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k13run4)

##    ind pop       v1       v2       v3       v4       v5       v6       v7
## 1 1065 SOC 0.000010 0.394353 0.009296 0.016374 0.017866 0.273391 0.000010
## 2 1066 SOC 0.000010 0.019216 0.000010 0.000010 0.000010 0.826863 0.000010
## 3 1067 SOC 0.000010 0.000010 0.000010 0.000010 0.000010 0.093768 0.000010
## 4 1068 SOC 0.000074 0.329223 0.006175 0.002333 0.044095 0.496514 0.017950
## 5 1069 SOC 0.000010 0.464799 0.000741 0.000010 0.011165 0.345466 0.000010
## 6 1070 SOC 0.016362 0.242028 0.000010 0.039087 0.000010 0.381455 0.007155
##         v8       v9      v10      v11      v12      v13
## 1 0.000015 0.221642 0.000010 0.000010 0.067012 0.000010
## 2 0.000010 0.117549 0.000010 0.036281 0.000010 0.000010
## 3 0.000010 0.906122 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.027967 0.006979 0.025743 0.038663 0.004274
## 5 0.032062 0.028323 0.043070 0.002007 0.066806 0.005531
## 6 0.000010 0.198886 0.023795 0.000010 0.091182 0.000010

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k13run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "blue",
    "green",
    "#1E90FF",      
    "#B20CD9",  
    "#FF8C1A",      
    "#B22222",    
    "#F49AC2",  
    "#77DD77",        
    "green4",
    "#008080",  
    "purple4",     
    "purple", 
    "yellow2"
    )


 # Generate all potential variable names
all_variables <- paste0("v", 1:13)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

 # Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n Admixture for 17,028 SNPs (Set 1).") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "admixture_europe_k13_r01_17k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8. Set 3: MAF 1% (r2<0.01) dataset from admixture

8.1 Admixture MAF 1% r2<0.01 structure plots

8.1.1 k=15

Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run4 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run4/r2_0.01_b.15.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k15run4)

## # A tibble: 6 × 15
##        X1      X2      X3       X4    X5      X6      X7      X8      X9     X10
##     <dbl>   <dbl>   <dbl>    <dbl> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.00001 0.00001 0.00491 0.000013  1e-5 0.00001 0.524   0.00001 0.00001 0.00001
## 2 0.00001 0.00001 0.00001 0.00001   1e-5 0.00001 0.00001 0.00001 0.00001 0.00001
## 3 0.00001 0.00001 0.802   0.00001   1e-5 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.00187 0.00001 0.00001 0.00001   1e-5 0.0177  0.332   0.0316  0.0252  0.00001
## 5 0.00001 0.0225  0.0319  0.00001   1e-5 0.00001 0.455   0.00589 0.00803 0.0362 
## 6 0.0125  0.00001 0.227   0.0354    1e-5 0.0142  0.236   0.00001 0.00001 0.00559
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run4)

head(k15run4)

##    ind pop       X1       X2       X3       X4    X5       X6       X7       X8
## 1 1065 SOC 0.000010 0.000010 0.004907 0.000013 1e-05 0.000010 0.524182 0.000010
## 2 1066 SOC 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 3 1067 SOC 0.000010 0.000010 0.801779 0.000010 1e-05 0.000010 0.000010 0.000010
## 4 1068 SOC 0.001872 0.000010 0.000010 0.000010 1e-05 0.017655 0.332402 0.031559
## 5 1069 SOC 0.000010 0.022489 0.031860 0.000010 1e-05 0.000010 0.455069 0.005887
## 6 1070 SOC 0.012493 0.000010 0.226502 0.035354 1e-05 0.014213 0.236432 0.000010
##         X9      X10      X11   X12      X13     X14      X15
## 1 0.000010 0.000010 0.348159 1e-05 0.059473 0.00001 0.063177
## 2 0.000010 0.000010 0.268918 1e-05 0.730952 0.00001 0.000010
## 3 0.000010 0.000010 0.083907 1e-05 0.114194 0.00001 0.000010
## 4 0.025172 0.000010 0.190328 1e-05 0.371940 0.00001 0.029002
## 5 0.008030 0.036174 0.074614 1e-05 0.300856 0.00813 0.056840
## 6 0.000010 0.005589 0.006392 1e-05 0.380527 0.00001 0.082439

Rename the columns

# Rename the columns starting from the third one
k15run4 <- k15run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k15run4)

##    ind pop       v1       v2       v3       v4    v5       v6       v7       v8
## 1 1065 SOC 0.000010 0.000010 0.004907 0.000013 1e-05 0.000010 0.524182 0.000010
## 2 1066 SOC 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 3 1067 SOC 0.000010 0.000010 0.801779 0.000010 1e-05 0.000010 0.000010 0.000010
## 4 1068 SOC 0.001872 0.000010 0.000010 0.000010 1e-05 0.017655 0.332402 0.031559
## 5 1069 SOC 0.000010 0.022489 0.031860 0.000010 1e-05 0.000010 0.455069 0.005887
## 6 1070 SOC 0.012493 0.000010 0.226502 0.035354 1e-05 0.014213 0.236432 0.000010
##         v9      v10      v11   v12      v13     v14      v15
## 1 0.000010 0.000010 0.348159 1e-05 0.059473 0.00001 0.063177
## 2 0.000010 0.000010 0.268918 1e-05 0.730952 0.00001 0.000010
## 3 0.000010 0.000010 0.083907 1e-05 0.114194 0.00001 0.000010
## 4 0.025172 0.000010 0.190328 1e-05 0.371940 0.00001 0.029002
## 5 0.008030 0.036174 0.074614 1e-05 0.300856 0.00813 0.056840
## 6 0.000010 0.005589 0.006392 1e-05 0.380527 0.00001 0.082439

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k15run4 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "purple4",        
    "#B20CD9", 
    "#F49AC2",    
    "blue",
    "chocolate4",      
    "#FF8C1A",       
    "#B22222",
    "green4",
    "purple",
    "orchid",    
    "yellow2",
    "#008080",  
    "#77DD77",  
    "#1E90FF",  
    "green"
)
  
 # Generate all potential variable names
all_variables <- paste0("v", 1:15)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                         color = color_palette[1:length(all_variables)])

 # Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n Admixture for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "MAF_1", "admixture_europe_k15_MAF1_r01_20k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.1.2 k=5

Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k5run2 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run2/r2_0.01_b.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k5run2)

## # A tibble: 6 × 5
##        X1      X2    X3      X4      X5
##     <dbl>   <dbl> <dbl>   <dbl>   <dbl>
## 1 0.00454 0.0590  0.900 0.0363  0.00001
## 2 0.00001 0.00001 0.998 0.00166 0.00001
## 3 0.00001 0.00001 1.00  0.00001 0.00001
## 4 0.00362 0.0429  0.855 0.0918  0.00704
## 5 0.00001 0.0538  0.840 0.0960  0.0102 
## 6 0.00001 0.0922  0.848 0.0405  0.0196

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k5run2 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k5run2)

Rename the columns

# Rename the columns starting from the third one
k5run2 <- k5run2 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k5run2 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
c(
     "#FF8C1A",
     "#FFFF19", 
     "#77DD37",    
     "purple3",
     "#1E90FF"
     )
  

 # Generate all potential variable names
all_variables <- paste0("v", 1:5)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "MAF_1", "admixture_europe_k5_MAF1_r01_20k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.1.3 k=6

Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k6run3 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.6.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k6run3)

## # A tibble: 6 × 6
##        X1      X2    X3      X4      X5      X6
##     <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.0558  0.00001 0.898 0.0130  0.0298  0.00373
## 2 0.00001 0.00001 0.997 0.00001 0.00311 0.00001
## 3 0.00001 0.00001 1.00  0.00001 0.00001 0.00001
## 4 0.0538  0.00598 0.852 0.00123 0.0848  0.00193
## 5 0.0142  0.00823 0.836 0.0474  0.0940  0.00001
## 6 0.115   0.0199  0.845 0.00001 0.0199  0.00001

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k6run3 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k6run3)

Rename the columns

# Rename the columns starting from the third one
k6run3 <- k6run3 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k6run3 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
c(
    "#1E90FF",  
    "red",
    "#FF8C1A", 
    "purple3",  
    "#FFFF19",     
    "#77DD37"
    )

 # Generate all potential variable names
all_variables <- paste0("v", 1:6)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n Admixture for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "MAF_1", "admixture_europe_k6_MAF1_r01_20k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.1.4 k=13

Import the Q matrix (K13 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k13run3 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.13.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k13run3)

## # A tibble: 6 × 13
##       X1      X2      X3      X4      X5      X6      X7      X8      X9     X10
##    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.215  0.00001 0.00001 0.00001 0.00001 0.0503  0.00001 0.00001 0.00776 0.0249 
## 2 0.110  0.00001 0.00001 0.0133  0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 3 0.914  0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.0176 0.00001 0.0175  0.00001 0.00001 0.0294  0.00001 0.0278  0.0332  0.00463
## 5 0.0313 0.00932 0.0362  0.00001 0.0244  0.0580  0.00001 0.00001 0.00746 0.00001
## 6 0.194  0.00001 0.00001 0.00001 0.00001 0.0886  0.0143  0.0147  0.00001 0.0346 
## # ℹ 3 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k13run3 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k13run3)

head(k13run3)

##    ind pop       X1       X2       X3      X4       X5       X6       X7
## 1 1065 SOC 0.214763 0.000010 0.000010 0.00001 0.000010 0.050272 0.000010
## 2 1066 SOC 0.109712 0.000010 0.000010 0.01334 0.000010 0.000010 0.000010
## 3 1067 SOC 0.914099 0.000010 0.000010 0.00001 0.000010 0.000010 0.000010
## 4 1068 SOC 0.017640 0.000010 0.017477 0.00001 0.000010 0.029389 0.000010
## 5 1069 SOC 0.031304 0.009315 0.036161 0.00001 0.024405 0.058041 0.000010
## 6 1070 SOC 0.193658 0.000010 0.000010 0.00001 0.000010 0.088579 0.014296
##         X8       X9      X10      X11      X12      X13
## 1 0.000010 0.007757 0.024929 0.406487 0.295402 0.000330
## 2 0.000010 0.000010 0.000010 0.000010 0.876847 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.085791 0.000010
## 4 0.027831 0.033217 0.004631 0.341965 0.524722 0.003088
## 5 0.000010 0.007463 0.000010 0.459753 0.373508 0.000010
## 6 0.014673 0.000010 0.034588 0.249920 0.404226 0.000010

Rename the columns

# Rename the columns starting from the third one
k13run3 <- k13run3 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k13run3)

##    ind pop       v1       v2       v3      v4       v5       v6       v7
## 1 1065 SOC 0.214763 0.000010 0.000010 0.00001 0.000010 0.050272 0.000010
## 2 1066 SOC 0.109712 0.000010 0.000010 0.01334 0.000010 0.000010 0.000010
## 3 1067 SOC 0.914099 0.000010 0.000010 0.00001 0.000010 0.000010 0.000010
## 4 1068 SOC 0.017640 0.000010 0.017477 0.00001 0.000010 0.029389 0.000010
## 5 1069 SOC 0.031304 0.009315 0.036161 0.00001 0.024405 0.058041 0.000010
## 6 1070 SOC 0.193658 0.000010 0.000010 0.00001 0.000010 0.088579 0.014296
##         v8       v9      v10      v11      v12      v13
## 1 0.000010 0.007757 0.024929 0.406487 0.295402 0.000330
## 2 0.000010 0.000010 0.000010 0.000010 0.876847 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.085791 0.000010
## 4 0.027831 0.033217 0.004631 0.341965 0.524722 0.003088
## 5 0.000010 0.007463 0.000010 0.459753 0.373508 0.000010
## 6 0.014673 0.000010 0.034588 0.249920 0.404226 0.000010

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k13run3 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(
    "#008080",        
    "purple4",  
    "purple",
    "blue",
    "yellow2",
    "#FF8C1A",    
    "#1E90FF",      
    "green4",    
    "green",
    "#B22222",  
    "#77DD77",  
    "#B20CD9",     
    "#F49AC2"
    )

 # Generate all potential variable names
all_variables <- paste0("v", 1:13)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                         color = color_palette[1:length(all_variables)])

 # Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n Admixture for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "MAF_1", "admixture_europe_k13_MAF1_r01_20k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.1.5 k=14

Import the Q matrix (K14 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k14run3 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.14.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k14run3)

## # A tibble: 6 × 14
##        X1      X2      X3      X4      X5      X6      X7     X8      X9     X10
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
## 1 0.00106 0.0508  1   e-5 0.00001 1   e-5 0.407   0.00001 0.215  0.0251  0.00765
## 2 0.00001 0.00001 1   e-5 0.00001 1   e-5 0.00001 0.00001 0.110  0.00001 0.00001
## 3 0.00001 0.00001 1   e-5 0.00001 1   e-5 0.00001 0.00001 0.914  0.00001 0.00001
## 4 0.00001 0.0293  1.76e-4 0.00001 5.52e-4 0.339   0.00001 0.0180 0.00522 0.0341 
## 5 0.00001 0.0567  1   e-5 0.0240  8.84e-3 0.460   0.0353  0.0308 0.00001 0.00513
## 6 0.00001 0.0871  1.31e-2 0.00001 1   e-5 0.250   0.00480 0.193  0.0331  0.00001
## # ℹ 4 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k14run3 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k14run3)

head(k14run3)

##    ind pop      X1       X2       X3       X4       X5       X6       X7
## 1 1065 SOC 0.00106 0.050765 0.000010 0.000010 0.000010 0.406851 0.000010
## 2 1066 SOC 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 3 1067 SOC 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.00001 0.029296 0.000176 0.000010 0.000552 0.339113 0.000010
## 5 1069 SOC 0.00001 0.056695 0.000010 0.023968 0.008836 0.459668 0.035294
## 6 1070 SOC 0.00001 0.087083 0.013070 0.000010 0.000010 0.250427 0.004795
##         X8       X9      X10      X11      X12      X13      X14
## 1 0.214582 0.025094 0.007646 0.000010 0.000013 0.293928 0.000010
## 2 0.109817 0.000010 0.000010 0.013705 0.000010 0.876368 0.000010
## 3 0.914203 0.000010 0.000010 0.000010 0.000010 0.085677 0.000010
## 4 0.017951 0.005221 0.034125 0.000010 0.026843 0.523743 0.022941
## 5 0.030823 0.000010 0.005126 0.000010 0.007622 0.371918 0.000010
## 6 0.193435 0.033125 0.000010 0.000010 0.000010 0.402723 0.015282

Rename the columns

# Rename the columns starting from the third one
k14run3 <- k14run3 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k14run3)

##    ind pop      v1       v2       v3       v4       v5       v6       v7
## 1 1065 SOC 0.00106 0.050765 0.000010 0.000010 0.000010 0.406851 0.000010
## 2 1066 SOC 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 3 1067 SOC 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.00001 0.029296 0.000176 0.000010 0.000552 0.339113 0.000010
## 5 1069 SOC 0.00001 0.056695 0.000010 0.023968 0.008836 0.459668 0.035294
## 6 1070 SOC 0.00001 0.087083 0.013070 0.000010 0.000010 0.250427 0.004795
##         v8       v9      v10      v11      v12      v13      v14
## 1 0.214582 0.025094 0.007646 0.000010 0.000013 0.293928 0.000010
## 2 0.109817 0.000010 0.000010 0.013705 0.000010 0.876368 0.000010
## 3 0.914203 0.000010 0.000010 0.000010 0.000010 0.085677 0.000010
## 4 0.017951 0.005221 0.034125 0.000010 0.026843 0.523743 0.022941
## 5 0.030823 0.000010 0.005126 0.000010 0.007622 0.371918 0.000010
## 6 0.193435 0.033125 0.000010 0.000010 0.000010 0.402723 0.015282

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k14run3 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
    c(   
    "green4", 
    "purple",
    "chocolate4", 
    "yellow2",    
    "#FF8C1A",  
    "#B22222",
    "#B20CD9",
    "#1E90FF",  
    "blue",
    "#77DD77",  
    "purple4",  
    "#008080",      
    "green",
    "#F49AC2"
    ) 

 # Generate all potential variable names
all_variables <- paste0("v", 1:14)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                         color = color_palette[1:length(all_variables)])

 # Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=14.\n Admixture for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "MAF_1", "admixture_europe_k14_MAF1_r01_20k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.1.6 k=18

Import the Q matrix (K18 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k18run1 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run1/r2_0.01_b.18.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(k18run1)

## # A tibble: 6 × 18
##        X1      X2      X3      X4      X5     X6      X7      X8      X9     X10
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.00001 0.00001 0.00001 0.0148  0.00001 0.0208 0.338   0.00001 0.00001 0.00001
## 2 0.00001 0.00001 0.0675  0.00001 0.00001 0.691  0.241   0.00001 0.00001 0.00001
## 3 0.00001 0.00001 0.00001 0.792   0.00001 0.128  0.0798  0.00001 0.00001 0.00001
## 4 0.00001 0.00001 0.171   0.00001 0.00079 0.374  0.183   0.00001 0.00001 0.0191 
## 5 0.00370 0.00001 0.0174  0.0301  0.00354 0.285  0.0677  0.0207  0.0496  0.00001
## 6 0.00001 0.0122  0.00001 0.226   0.00001 0.363  0.00001 0.00001 0.0140  0.0122 
## # ℹ 8 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k18run1 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k18run1)

head(k18run1)

##    ind pop       X1      X2       X3       X4       X5       X6       X7
## 1 1065 SOC 0.000010 0.00001 0.000010 0.014837 0.000010 0.020841 0.337713
## 2 1066 SOC 0.000010 0.00001 0.067510 0.000010 0.000010 0.691434 0.240906
## 3 1067 SOC 0.000010 0.00001 0.000010 0.792464 0.000010 0.127570 0.079816
## 4 1068 SOC 0.000010 0.00001 0.171399 0.000010 0.000790 0.374352 0.183001
## 5 1069 SOC 0.003705 0.00001 0.017397 0.030122 0.003544 0.285460 0.067722
## 6 1070 SOC 0.000010 0.01216 0.000010 0.226478 0.000010 0.362962 0.000010
##         X8       X9      X10      X11      X12   X13      X14   X15      X16
## 1 0.000010 0.000010 0.000010 0.560465 0.066013 1e-05 0.000010 1e-05 0.000010
## 2 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 4 0.000010 0.000010 0.019120 0.229189 0.006437 1e-05 0.015612 1e-05 0.000010
## 5 0.020749 0.049572 0.000010 0.456702 0.053867 1e-05 0.011089 1e-05 0.000010
## 6 0.000010 0.014015 0.012198 0.256707 0.080054 1e-05 0.000010 1e-05 0.035326
##     X17   X18
## 1 1e-05 1e-05
## 2 1e-05 1e-05
## 3 1e-05 1e-05
## 4 1e-05 1e-05
## 5 1e-05 1e-05
## 6 1e-05 1e-05

Rename the columns

# Rename the columns starting from the third one
k18run1 <- k18run1 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(k18run1)

##    ind pop       v1      v2       v3       v4       v5       v6       v7
## 1 1065 SOC 0.000010 0.00001 0.000010 0.014837 0.000010 0.020841 0.337713
## 2 1066 SOC 0.000010 0.00001 0.067510 0.000010 0.000010 0.691434 0.240906
## 3 1067 SOC 0.000010 0.00001 0.000010 0.792464 0.000010 0.127570 0.079816
## 4 1068 SOC 0.000010 0.00001 0.171399 0.000010 0.000790 0.374352 0.183001
## 5 1069 SOC 0.003705 0.00001 0.017397 0.030122 0.003544 0.285460 0.067722
## 6 1070 SOC 0.000010 0.01216 0.000010 0.226478 0.000010 0.362962 0.000010
##         v8       v9      v10      v11      v12   v13      v14   v15      v16
## 1 0.000010 0.000010 0.000010 0.560465 0.066013 1e-05 0.000010 1e-05 0.000010
## 2 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 4 0.000010 0.000010 0.019120 0.229189 0.006437 1e-05 0.015612 1e-05 0.000010
## 5 0.020749 0.049572 0.000010 0.456702 0.053867 1e-05 0.011089 1e-05 0.000010
## 6 0.000010 0.014015 0.012198 0.256707 0.080054 1e-05 0.000010 1e-05 0.035326
##     v17   v18
## 1 1e-05 1e-05
## 2 1e-05 1e-05
## 3 1e-05 1e-05
## 4 1e-05 1e-05
## 5 1e-05 1e-05
## 6 1e-05 1e-05

Import samples attributes

sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)

pops <- sampling_loc |>
  filter(
    Continent == "Europe"
  ) |>
  dplyr::select(
    Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
  )

head(pops)

##   Abbreviation Latitude Longitude             Pop_City  Country          Region
## 1          FRS 45.16531  5.771806 Saint-Martin-d'Heres   France  Western Europe
## 2          STS 48.61124  7.754512           Strasbourg   France  Western Europe
## 3          POP 41.18555 -8.329371             Penafiel Portugal Southern Europe
## 4          POL 37.09084 -8.092465                Loule Portugal Southern Europe
## 5          SPB 38.86622 -6.974194              Badajoz    Spain Southern Europe
## 6          SPS 36.17042 -5.371530            San Roque    Spain Southern Europe
##     Subregion Year order
## 1 West Europe 2019     1
## 2 West Europe 2019     2
## 3 West Europe 2017     3
## 4 West Europe 2017     4
## 5 West Europe 2018     5
## 6 West Europe 2017     6

source(
  here(
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- k18run1 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value")

# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Region_Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Region_Country = unique(Q_ordered$Region_Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Region_Country, function(rc)
    max(which(Q_ordered$Region_Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette <-
  c(
    "green4",    
    "#75FAFF",  
    "goldenrod",  
    "yellow2",     
    "#B22222",  
    "orchid", 
    "#F49AC2",     
    "purple",    
    "purple4",    
    "#B20CD9",      
    "#1E90FF",    
    "green",   
    "#77DD77",     
    "#FF8C1A",  
    "chocolate4",
    "#008080", 
    "#FFFF99",     
    "blue"  
    )
  

 # Generate all potential variable names
all_variables <- paste0("v", 1:18)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                         color = color_palette[1:length(all_variables)])

 # Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")


# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n Admixture for k1:25 with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here(
    "output", "europe", "admixture", "MAF_1", "admixture_europe_k18_MAF1_r01_20k.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.2 Pie-charts for MAF 1% r2<0.01 admixture (Set 3)

8.2.1 k=15 map

Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k15run4 <- read_delim(
  here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run4/r2_0.01_b.15.Q"#copy one of Q matricies with best k to here and rename it
  ),
  delim = " ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k15run4 <- k15run4 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k15run4)

## # A tibble: 6 × 15
##        X1      X2      X3       X4    X5      X6      X7      X8      X9     X10
##     <dbl>   <dbl>   <dbl>    <dbl> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.00001 0.00001 0.00491 0.000013  1e-5 0.00001 0.524   0.00001 0.00001 0.00001
## 2 0.00001 0.00001 0.00001 0.00001   1e-5 0.00001 0.00001 0.00001 0.00001 0.00001
## 3 0.00001 0.00001 0.802   0.00001   1e-5 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.00187 0.00001 0.00001 0.00001   1e-5 0.0177  0.332   0.0316  0.0252  0.00001
## 5 0.00001 0.0225  0.0319  0.00001   1e-5 0.00001 0.455   0.00589 0.00803 0.0362 
## 6 0.0125  0.00001 0.227   0.0354    1e-5 0.0142  0.236   0.00001 0.00001 0.00559
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k15run4 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k15run4)

Rename the columns

# Rename the columns starting from the third one
k15run4 <- k15run4 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Merge with pops

# Add an index column to Q_tibble
k15run4$index <- seq_len(nrow(k15run4))

# Perform the merge as before
df1 <-
  merge(
    k15run4,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1       v2       v3       v4    v5       v6       v7
## 310 SOC 1065 0.000010 0.000010 0.004907 0.000013 1e-05 0.000010 0.524182
## 311 SOC 1066 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010
## 312 SOC 1067 0.000010 0.000010 0.801779 0.000010 1e-05 0.000010 0.000010
## 313 SOC 1068 0.001872 0.000010 0.000010 0.000010 1e-05 0.017655 0.332402
## 314 SOC 1069 0.000010 0.022489 0.031860 0.000010 1e-05 0.000010 0.455069
## 315 SOC 1070 0.012493 0.000010 0.226502 0.035354 1e-05 0.014213 0.236432
##           v8       v9      v10      v11   v12      v13     v14      v15
## 310 0.000010 0.000010 0.000010 0.348159 1e-05 0.059473 0.00001 0.063177
## 311 0.000010 0.000010 0.000010 0.268918 1e-05 0.730952 0.00001 0.000010
## 312 0.000010 0.000010 0.000010 0.083907 1e-05 0.114194 0.00001 0.000010
## 313 0.031559 0.025172 0.000010 0.190328 1e-05 0.371940 0.00001 0.029002
## 314 0.005887 0.008030 0.036174 0.074614 1e-05 0.300856 0.00813 0.056840
## 315 0.000010 0.000010 0.005589 0.006392 1e-05 0.380527 0.00001 0.082439
##     Latitude Longitude Pop_City Country         Region   Subregion Year order
## 310 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 311 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 312 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 313 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 314 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 315 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38

Q-values for k=15

make a palette with 15 colors

colors2 <-c(
      "v1" = "#B22222",
      "v2" = "#1E90FF", 
      "v3" = "#B20CD9", 
      "v4" = "#FFB347",
      "v5" = "green4",
      "v6" = "#008080",
      "v7" = "purple",
      "v8" = "#F49AC2",
      "v9" = "chocolate4",
      "v10" = "#FF8C1A",
      "v11" = "orchid",
      "v12" = "green",
      "v13" = "purple4",
      "v14" = "blue",
      "v15" = "yellow2"
      )
colors2

##           v1           v2           v3           v4           v5           v6 
##    "#B22222"    "#1E90FF"    "#B20CD9"    "#FFB347"     "green4"    "#008080" 
##           v7           v8           v9          v10          v11          v12 
##     "purple"    "#F49AC2" "chocolate4"    "#FF8C1A"     "orchid"      "green" 
##          v13          v14          v15 
##    "purple4"       "blue"    "yellow2"

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k15_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k15_pie_no_labs.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.2.2 k=5 map

Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k5run2 <- read_delim(
  here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run2/r2_0.01_b.5.Q"#copy one of Q matricies with best k to here and rename it
  ),
  delim = " ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k5run2 <- k5run2 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k5run2)

## # A tibble: 6 × 5
##        X1      X2    X3      X4      X5
##     <dbl>   <dbl> <dbl>   <dbl>   <dbl>
## 1 0.00454 0.0590  0.900 0.0363  0.00001
## 2 0.00001 0.00001 0.998 0.00166 0.00001
## 3 0.00001 0.00001 1.00  0.00001 0.00001
## 4 0.00362 0.0429  0.855 0.0918  0.00704
## 5 0.00001 0.0538  0.840 0.0960  0.0102 
## 6 0.00001 0.0922  0.848 0.0405  0.0196

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k5run2 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k5run2)

Rename the columns

# Rename the columns starting from the third one
k5run2 <- k5run2 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Merge with pops

# Add an index column to Q_tibble
k5run2$index <- seq_len(nrow(k5run2))

# Perform the merge as before
df1 <-
  merge(
    k5run2,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1       v2       v3       v4       v5 Latitude Longitude
## 310 SOC 1065 0.004541 0.059013 0.900098 0.036339 0.000010 43.60042  39.74533
## 311 SOC 1066 0.000010 0.000010 0.998310 0.001660 0.000010 43.60042  39.74533
## 312 SOC 1067 0.000010 0.000010 0.999960 0.000010 0.000010 43.60042  39.74533
## 313 SOC 1068 0.003617 0.042869 0.854715 0.091759 0.007040 43.60042  39.74533
## 314 SOC 1069 0.000010 0.053826 0.839905 0.096034 0.010225 43.60042  39.74533
## 315 SOC 1070 0.000010 0.092154 0.847758 0.040523 0.019555 43.60042  39.74533
##     Pop_City Country         Region   Subregion Year order
## 310    Sochi  Russia Eastern Europe East Europe 2021    38
## 311    Sochi  Russia Eastern Europe East Europe 2021    38
## 312    Sochi  Russia Eastern Europe East Europe 2021    38
## 313    Sochi  Russia Eastern Europe East Europe 2021    38
## 314    Sochi  Russia Eastern Europe East Europe 2021    38
## 315    Sochi  Russia Eastern Europe East Europe 2021    38

Q-values for k=5

make a palette with 5 colors

colors2 <-
c(
    "#77DD37",  
    "#1E90FF",
    "purple3",
    "#FF8C1A",
    "#FFFF19"    
)

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k5_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k5_pie_no_labs.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.2.3 k=18 map

Import the Q matrix (K18 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k18run1 <- read_delim(
  here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run1/r2_0.01_b.18.Q"#copy one of Q matricies with best k to here and rename it
  ),
  delim = " ",
  # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
)

# Using mutate and across to round all columns to 4 decimal places
k18run1 <- k18run1 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k18run1)

## # A tibble: 6 × 18
##        X1      X2      X3      X4      X5     X6      X7      X8      X9     X10
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.00001 0.00001 0.00001 0.0148  0.00001 0.0208 0.338   0.00001 0.00001 0.00001
## 2 0.00001 0.00001 0.0675  0.00001 0.00001 0.691  0.241   0.00001 0.00001 0.00001
## 3 0.00001 0.00001 0.00001 0.792   0.00001 0.128  0.0798  0.00001 0.00001 0.00001
## 4 0.00001 0.00001 0.171   0.00001 0.00079 0.374  0.183   0.00001 0.00001 0.0191 
## 5 0.00370 0.00001 0.0174  0.0301  0.00354 0.285  0.0677  0.0207  0.0496  0.00001
## 6 0.00001 0.0122  0.00001 0.226   0.00001 0.363  0.00001 0.00001 0.0140  0.0122 
## # ℹ 8 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k18run1 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k18run1)

Rename the columns

# Rename the columns starting from the third one
k18run1 <- k18run1 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Merge with pops

# Add an index column to Q_tibble
k18run1$index <- seq_len(nrow(k18run1))

# Perform the merge as before
df1 <-
  merge(
    k18run1,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1      v2       v3       v4       v5       v6       v7
## 310 SOC 1065 0.000010 0.00001 0.000010 0.014837 0.000010 0.020841 0.337713
## 311 SOC 1066 0.000010 0.00001 0.067510 0.000010 0.000010 0.691434 0.240906
## 312 SOC 1067 0.000010 0.00001 0.000010 0.792464 0.000010 0.127570 0.079816
## 313 SOC 1068 0.000010 0.00001 0.171399 0.000010 0.000790 0.374352 0.183001
## 314 SOC 1069 0.003705 0.00001 0.017397 0.030122 0.003544 0.285460 0.067722
## 315 SOC 1070 0.000010 0.01216 0.000010 0.226478 0.000010 0.362962 0.000010
##           v8       v9      v10      v11      v12   v13      v14   v15      v16
## 310 0.000010 0.000010 0.000010 0.560465 0.066013 1e-05 0.000010 1e-05 0.000010
## 311 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 312 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 313 0.000010 0.000010 0.019120 0.229189 0.006437 1e-05 0.015612 1e-05 0.000010
## 314 0.020749 0.049572 0.000010 0.456702 0.053867 1e-05 0.011089 1e-05 0.000010
## 315 0.000010 0.014015 0.012198 0.256707 0.080054 1e-05 0.000010 1e-05 0.035326
##       v17   v18 Latitude Longitude Pop_City Country         Region   Subregion
## 310 1e-05 1e-05 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe
## 311 1e-05 1e-05 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe
## 312 1e-05 1e-05 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe
## 313 1e-05 1e-05 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe
## 314 1e-05 1e-05 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe
## 315 1e-05 1e-05 43.60042  39.74533    Sochi  Russia Eastern Europe East Europe
##     Year order
## 310 2021    38
## 311 2021    38
## 312 2021    38
## 313 2021    38
## 314 2021    38
## 315 2021    38

Q-values for k=18

make a palette with 18 colors

colors2 <-c(
      "v1" = "blue",
      "v2" = "#B22222", 
      "v3" = "goldenrod", 
      "v4" = "orchid", 
      "v5" = "#F49AC2",
      "v6" = "purple4",
      "v7" = "purple",
      "v8" = "#1E90FF",
      "v9" = "#FF8C1A",
      "v10" = "#77DD77",
      "v11" = "#B20CD9",
      "v12" = "yellow2",
      "v13" = "green4",
      "v14" = "chocolate4",
      "v15" = "green",
      "v16" = "#008080",    
      "v17" = "#75FAFF",
      "v18" = "#FFFF99"  
      )
colors2

##           v1           v2           v3           v4           v5           v6 
##       "blue"    "#B22222"  "goldenrod"     "orchid"    "#F49AC2"    "purple4" 
##           v7           v8           v9          v10          v11          v12 
##     "purple"    "#1E90FF"    "#FF8C1A"    "#77DD77"    "#B20CD9"    "yellow2" 
##          v13          v14          v15          v16          v17          v18 
##     "green4" "chocolate4"      "green"    "#008080"    "#75FAFF"    "#FFFF99"

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k18_pie_all_countries.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), color = "black") +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k18_pie_outlined.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.2.4 k=13 map

Import the Q matrix (K13 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k13run3 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.13.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
# Using mutate and across to round all columns to 4 decimal places
k13run3 <- k13run3 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k13run3)

## # A tibble: 6 × 13
##       X1      X2      X3      X4      X5      X6      X7      X8      X9     X10
##    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.215  0.00001 0.00001 0.00001 0.00001 0.0503  0.00001 0.00001 0.00776 0.0249 
## 2 0.110  0.00001 0.00001 0.0133  0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 3 0.914  0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.0176 0.00001 0.0175  0.00001 0.00001 0.0294  0.00001 0.0278  0.0332  0.00463
## 5 0.0313 0.00932 0.0362  0.00001 0.0244  0.0580  0.00001 0.00001 0.00746 0.00001
## 6 0.194  0.00001 0.00001 0.00001 0.00001 0.0886  0.0143  0.0147  0.00001 0.0346 
## # ℹ 3 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k13run3 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k13run3)

Rename the columns

# Rename the columns starting from the third one
k13run3 <- k13run3 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Merge with pops

# Add an index column to Q_tibble
k13run3$index <- seq_len(nrow(k13run3))

# Perform the merge as before
df1 <-
  merge(
    k13run3,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind       v1       v2       v3      v4       v5       v6       v7
## 310 SOC 1065 0.214763 0.000010 0.000010 0.00001 0.000010 0.050272 0.000010
## 311 SOC 1066 0.109712 0.000010 0.000010 0.01334 0.000010 0.000010 0.000010
## 312 SOC 1067 0.914099 0.000010 0.000010 0.00001 0.000010 0.000010 0.000010
## 313 SOC 1068 0.017640 0.000010 0.017477 0.00001 0.000010 0.029389 0.000010
## 314 SOC 1069 0.031304 0.009315 0.036161 0.00001 0.024405 0.058041 0.000010
## 315 SOC 1070 0.193658 0.000010 0.000010 0.00001 0.000010 0.088579 0.014296
##           v8       v9      v10      v11      v12      v13 Latitude Longitude
## 310 0.000010 0.007757 0.024929 0.406487 0.295402 0.000330 43.60042  39.74533
## 311 0.000010 0.000010 0.000010 0.000010 0.876847 0.000010 43.60042  39.74533
## 312 0.000010 0.000010 0.000010 0.000010 0.085791 0.000010 43.60042  39.74533
## 313 0.027831 0.033217 0.004631 0.341965 0.524722 0.003088 43.60042  39.74533
## 314 0.000010 0.007463 0.000010 0.459753 0.373508 0.000010 43.60042  39.74533
## 315 0.014673 0.000010 0.034588 0.249920 0.404226 0.000010 43.60042  39.74533
##     Pop_City Country         Region   Subregion Year order
## 310    Sochi  Russia Eastern Europe East Europe 2021    38
## 311    Sochi  Russia Eastern Europe East Europe 2021    38
## 312    Sochi  Russia Eastern Europe East Europe 2021    38
## 313    Sochi  Russia Eastern Europe East Europe 2021    38
## 314    Sochi  Russia Eastern Europe East Europe 2021    38
## 315    Sochi  Russia Eastern Europe East Europe 2021    38

Q-values for k=13

make a palette with 13 colors

colors2 <-c(
      "v1" = "purple4",
      "v2" = "blue", 
      "v3" = "#FF8C1A", 
      "v4" = "green4",
      "v5" = "#1E90FF",
      "v6" = "yellow2",
      "v7" = "#B22222",
      "v8" = "#77DD77",
      "v9" = "#F49AC2",
      "v10" = "#008080",
      "v11" = "#B20CD9",
      "v12" = "purple",
      "v13" = "green"
      )

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k13_pie_all_countries.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), color = NA) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k13_pie_no_labs.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

8.2.5 k=14 map

Import the Q matrix (K14 for admixture) Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
k14run3 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.14.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

# Using mutate and across to round all columns to 4 decimal places
k14run3 <- k14run3 %>%
  mutate(across(everything(), ~ round(.x, 6)))

# Viewing the first few rows to verify the result
head(k14run3)

## # A tibble: 6 × 14
##        X1      X2      X3      X4      X5      X6      X7     X8      X9     X10
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
## 1 0.00106 0.0508  1   e-5 0.00001 1   e-5 0.407   0.00001 0.215  0.0251  0.00765
## 2 0.00001 0.00001 1   e-5 0.00001 1   e-5 0.00001 0.00001 0.110  0.00001 0.00001
## 3 0.00001 0.00001 1   e-5 0.00001 1   e-5 0.00001 0.00001 0.914  0.00001 0.00001
## 4 0.00001 0.0293  1.76e-4 0.00001 5.52e-4 0.339   0.00001 0.0180 0.00522 0.0341 
## 5 0.00001 0.0567  1   e-5 0.0240  8.84e-3 0.460   0.0353  0.0308 0.00001 0.00513
## 6 0.00001 0.0871  1.31e-2 0.00001 1   e-5 0.250   0.00480 0.193  0.0331  0.00001
## # ℹ 4 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>

The fam file

fam_file <- here(
  "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)

##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)

##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to matrix

k14run3 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(k14run3)

Rename the columns

# Rename the columns starting from the third one
k14run3 <- k14run3 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

Merge with pops

# Add an index column to Q_tibble
k14run3$index <- seq_len(nrow(k14run3))

# Perform the merge as before
df1 <-
  merge(
    k14run3,
    pops,
    by.x = 2,
    by.y = 1,
    all.x = T,
    all.y = F
  ) |>
  na.omit()

# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]

# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL

# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)

##     pop  ind      v1       v2       v3       v4       v5       v6       v7
## 310 SOC 1065 0.00106 0.050765 0.000010 0.000010 0.000010 0.406851 0.000010
## 311 SOC 1066 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 312 SOC 1067 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 313 SOC 1068 0.00001 0.029296 0.000176 0.000010 0.000552 0.339113 0.000010
## 314 SOC 1069 0.00001 0.056695 0.000010 0.023968 0.008836 0.459668 0.035294
## 315 SOC 1070 0.00001 0.087083 0.013070 0.000010 0.000010 0.250427 0.004795
##           v8       v9      v10      v11      v12      v13      v14 Latitude
## 310 0.214582 0.025094 0.007646 0.000010 0.000013 0.293928 0.000010 43.60042
## 311 0.109817 0.000010 0.000010 0.013705 0.000010 0.876368 0.000010 43.60042
## 312 0.914203 0.000010 0.000010 0.000010 0.000010 0.085677 0.000010 43.60042
## 313 0.017951 0.005221 0.034125 0.000010 0.026843 0.523743 0.022941 43.60042
## 314 0.030823 0.000010 0.005126 0.000010 0.007622 0.371918 0.000010 43.60042
## 315 0.193435 0.033125 0.000010 0.000010 0.000010 0.402723 0.015282 43.60042
##     Longitude Pop_City Country         Region   Subregion Year order
## 310  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 311  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 312  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 313  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 314  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38
## 315  39.74533    Sochi  Russia Eastern Europe East Europe 2021    38

Q-values for k=14

make a palette with 14 colors

colors2 <-c(
      "v1" = "green",
      "v2" = "yellow2", 
      "v3" = "#B22222", 
      "v4" = "#1E90FF",
      "v5" = "blue",
      "v6" = "#B20CD9",
      "v7" = "#FF8C1A",
      "v8" = "purple4",
      "v9" = "#008080",
      "v10" = "#F49AC2",
      "v11" = "green4",
      "v12" = "chocolate4",
      "v13" = "purple",
      "v14" = "#77DD77"
      )

Make pie plot

world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)

#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = NA) +
  geom_text_repel(data = df_mean,
                  aes(x = Longitude, y = Latitude, label = pop), 
                  size = 3, 
                  box.padding = unit(0.5, "lines"),
                  max.overlaps = 50) +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k14_pie.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)

selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
#  filter(admin %in% countries_with_data)

# Calculate mean proportions for each population
df_mean <- df1 |>
  group_by(pop) |>
  summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)), 
            Longitude = mean(Longitude),
            Latitude = mean(Latitude))

source(
  here(
    "analyses", "my_theme2.R"
  )
)

ggplot() +
  geom_sf(data = selected_countries, fill="white") +
  geom_scatterpie(data = df_mean, 
                  aes(x = Longitude, y = Latitude, r = 1.5), 
                  cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = "black") +
  scale_fill_manual(values = colors2) +
  guides(fill = "none") +  # Hide legend
  # coord_sf() +
  coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
  my_theme()

# # 
ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k14_pie_outlined.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

Europe_structure_plots

Margaret Corley

2024-04-18

1. Load Packages

2. Sample locations

fastStructure

3. Set 2: r2<0.1 dataset from fastStructure simple

3.1 Structure plots for fastStructure data (r2<0.1)

3.1.1 k=15

3.1.2 k=13

3.1.3 k=18

3.1.4 k=5

3.1.5 k=6

3.2 Pie-charts for r2<0.1 fastStructure

3.2.1. Import the Q matrix (K15 for fastStructure)

3.2.2 Q-values

4. Set 1: r2<0.01 dataset from fastStructure simple

4.1 Structure plots for fastStructure data (r2<0.01)

4.1.1 k=15

4.1.2 k=13

4.1.3 k=18

4.1.4 k=5

4.1.5 k=6

4.2 Pie-charts for r2<0.01 fastStructure

4.2.1 Import the Q matrix (K15 for fastStructure)

4.2.2 Q-values

5. Set 3: MAF 1% (r2<0.01) dataset from fastStructure simple

5.1 Structure plots for fastStructure data (r2<0.01)

5.1.1 k=15

5.1.2 k=13

5.1.3 k=18

5.1.4 k=5

5.1.5 k=6

5.1.6 k=20

5.2 Pie-charts for MAF 1% r2<0.01 fastStructure

5.2.1 k=15 map

5.2.2 k=5 map

5.2.3 k=18 map

5.2.4 k=14 map

5.2.5 k=13 map

5.3 Maps by subregion for MAF 1% (r2<0.01)

5.3.1.1 zoom in on Italy

5.3.1.2 zoom in on Eastern Europe

5.3.1.3 zoom in on Iberian peninsula

5.3.1.4 zoom in on Balkans

5.3.2 k=14

5.3.2.1 zoom in on Italy

5.3.2.2 zoom in on Eastern Europe

5.3.2.3 zoom in on Iberian peninsula

5.3.2.4 zoom in on Balkans

5.3.3 k=13

5.3.3.1 zoom in on Italy

5.3.3.2 zoom in on Eastern Europe

5.3.3.3 zoom in on Iberian peninsula

5.3.3.4 zoom in on Balkans

5.3.4 k=18

5.3.4.1 zoom in on Italy

5.3.4.2 zoom in on Eastern Europe

5.3.4.3 zoom in on Iberian peninsula

5.3.3.4 zoom in on Balkans

Admixture

6. Set 2: r2<0.1 dataset from admixture

6.1 Admixture r2<0.1 structure plots

6.1.1 k=15

6.1.2 k=5

6.1.3 k=6

7. Set 1: r2<0.01 dataset from admixture

7.1 Admixture r2<0.01 structure plots

7.1.1 k=15

7.1.2 k=5

7.1.3 k=6

7.1.4 k=13

8. Set 3: MAF 1% (r2<0.01) dataset from admixture

8.1 Admixture MAF 1% r2<0.01 structure plots

8.1.1 k=15

8.1.2 k=5

8.1.3 k=6

8.1.4 k=13

8.1.5 k=14

8.1.6 k=18