Load the standard packages
#install.packages(c("raster", "sf", "tidyverse", "fields", "downloader"))
library(raster) # important to load before tidyverse, otherwise it masks select()
library(tidyverse)
library(scatterpie)
library(sf)
library(ggspatial)
library(ggplot2)
library(dplyr)
library(colorout)
library(here)
library(extrafont)
library(rnaturalearth)
library(rnaturalearthdata)
library(rnaturalearthhires)
library(ggrepel)
library(Cairo)
Import samples attributes
#data<-read.csv("sampling_loc_all.csv", stringsAsFactors = TRUE)
#write_rds(data, "sampling_loc_all.rds")
sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 9
## 2 West Europe 2019 10
## 3 West Europe 2017 11
## 4 West Europe 2017 12
## 5 West Europe 2018 13
## 6 West Europe 2017 14
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run73 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run073/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k15run73)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000001 0.000001 1.59e-1 1e-6 1e-6 1 e-6 0.841 1 e-6 1e-6 1e-6 1e-6
## 2 0.000001 0.000001 1 e-6 1e-6 1e-6 1 e-6 1.00 1 e-6 1e-6 1e-6 1e-6
## 3 0.000001 0.000001 9.24e-1 1e-6 1e-6 1 e-6 0.0757 1 e-6 1e-6 1e-6 1e-6
## 4 0.000001 0.000001 3.35e-3 1e-6 1e-6 1 e-6 0.996 7.92e-4 1e-6 1e-6 1e-6
## 5 0.000001 0.000001 5.06e-3 1e-6 1e-6 1 e-6 0.995 1 e-6 1e-6 1e-6 1e-6
## 6 0.0783 0.000001 1.20e-1 1e-6 1e-6 9.94e-3 0.791 1 e-6 1e-6 1e-6 1e-6
## # ℹ 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 1065 SOC 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001 1e-06
## 3 1067 SOC 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001 1e-06
## 4 1068 SOC 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792 1e-06
## 5 1069 SOC 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001 1e-06
## 6 1070 SOC 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001 1e-06
## X10 X11 X12 X13 X14 X15
## 1 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
Rename the columns
# Rename the columns starting from the third one
k15run73 <- k15run73 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run73)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9
## 1 1065 SOC 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001 1e-06
## 3 1067 SOC 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001 1e-06
## 4 1068 SOC 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792 1e-06
## 5 1069 SOC 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001 1e-06
## 6 1070 SOC 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001 1e-06
## v10 v11 v12 v13 v14 v15
## 1 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k15run73 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"blue",
"#1E90FF",
"chocolate4",
"green",
"#75FAFF",
"#B20CD9",
"purple4",
"#F49AC2",
"yellow2",
"#FFFF99",
"#B22222",
"#FFB347",
"#77DD77",
"#FF8C1A",
"purple"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n FastStructure for k1:40 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k13run11 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run011/simple.13.meanQ"
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k13run11)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.209 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 7.91e-1 1 e-6 1 e-6
## 2 0.741 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2.59e-1 1 e-6 1 e-6
## 3 1.00 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1 e-6 1 e-6 1 e-6
## 4 0.250 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 7.50e-1 1 e-6 1 e-6
## 5 0.175 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 8.06e-1 1 e-6 1.87e-2
## 6 0.354 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 6.30e-1 1.60e-2 1 e-6
## # ℹ 1 more variable: X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 0.208893 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.791098
## 2 1066 SOC 0.741139 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.258852
## 3 1067 SOC 0.999990 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001
## 4 1068 SOC 0.249965 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.750026
## 5 1069 SOC 0.175186 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.806057
## 6 1070 SOC 0.354239 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.629777
## X11 X12 X13
## 1 0.000001 0.000001 1e-06
## 2 0.000001 0.000001 1e-06
## 3 0.000001 0.000001 1e-06
## 4 0.000001 0.000001 1e-06
## 5 0.000001 0.018748 1e-06
## 6 0.015976 0.000001 1e-06
Rename the columns
# Rename the columns starting from the third one
k13run11 <- k13run11 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k13run11)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 0.208893 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.791098
## 2 1066 SOC 0.741139 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.258852
## 3 1067 SOC 0.999990 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001
## 4 1068 SOC 0.249965 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.750026
## 5 1069 SOC 0.175186 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.806057
## 6 1070 SOC 0.354239 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.629777
## v11 v12 v13
## 1 0.000001 0.000001 1e-06
## 2 0.000001 0.000001 1e-06
## 3 0.000001 0.000001 1e-06
## 4 0.000001 0.000001 1e-06
## 5 0.000001 0.018748 1e-06
## 6 0.015976 0.000001 1e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k13run11 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#77DD77",
"yellow2",
"#75FAFF",
"#B22222",
"chocolate4",
"#008080",
"green",
"purple",
"purple4",
"#FF8C1A",
"blue",
"#F49AC2",
"#1E90FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:13)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n FastStructure for k1:40 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k18run22 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run022/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k18run22)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1e-6 1e-6 1.01e-2 1e-6 1e-6 1e-6 5.39e-1 1e-6 1e-6 1 e-6 1e-6 1e-6
## 2 1e-6 1e-6 1 e-6 1e-6 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6 1e-6
## 3 1e-6 1e-6 1 e-6 1e-6 1e-6 1e-6 1 e-6 1e-6 1e-6 3.15e-1 1e-6 1e-6
## 4 1e-6 1e-6 1 e-6 1e-6 1e-6 1e-6 4.04e-1 1e-6 1e-6 1 e-6 1e-6 1e-6
## 5 1e-6 1e-6 1 e-6 1e-6 1e-6 1e-6 5.64e-1 1e-6 1e-6 9.88e-2 1e-6 1e-6
## 6 1e-6 1e-6 6.38e-2 1e-6 1e-6 1e-6 5.22e-1 1e-6 1e-6 1.22e-1 1e-6 1e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## # X18 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 1e-06 1e-06 0.010103 1e-06 1e-06 1e-06 0.538619 1e-06 1e-06 0.000001
## 2 1066 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001
## 3 1067 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.314543
## 4 1068 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.404441 1e-06 1e-06 0.000001
## 5 1069 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.564054 1e-06 1e-06 0.098750
## 6 1070 SOC 1e-06 1e-06 0.063819 1e-06 1e-06 1e-06 0.522484 1e-06 1e-06 0.121619
## X11 X12 X13 X14 X15 X16 X17 X18
## 1 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.451269 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.999990 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.685447 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 0.025585 1e-06 0.569965 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.337187 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.292070 1e-06
Rename the columns
# Rename the columns starting from the third one
k18run22 <- k18run22 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k18run22)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 1e-06 1e-06 0.010103 1e-06 1e-06 1e-06 0.538619 1e-06 1e-06 0.000001
## 2 1066 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001
## 3 1067 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.314543
## 4 1068 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.404441 1e-06 1e-06 0.000001
## 5 1069 SOC 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06 0.564054 1e-06 1e-06 0.098750
## 6 1070 SOC 1e-06 1e-06 0.063819 1e-06 1e-06 1e-06 0.522484 1e-06 1e-06 0.121619
## v11 v12 v13 v14 v15 v16 v17 v18
## 1 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.451269 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.999990 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.685447 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 0.025585 1e-06 0.569965 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.337187 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 0.292070 1e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k18run22 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FF8C1A",
"blue",
"#008080",
"#77DD77",
"purple",
"#F49AC2",
"green",
"#B22222",
"chocolate4",
"#B20CD9",
"green4",
"purple4",
"#FFFF99",
"#75FAFF",
"orchid",
"#1E90FF",
"#FFB347",
"yellow"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:18)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n FastStructure for k1:40 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k5run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run004/simple.5.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run4)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0165 0.000002 0.984 0.000002 0.000002
## 2 0.000002 0.000002 1.00 0.000002 0.000002
## 3 0.000002 0.000002 1.00 0.000002 0.000002
## 4 0.000002 0.000002 0.954 0.0463 0.000002
## 5 0.000002 0.000002 0.933 0.0674 0.000002
## 6 0.000002 0.000002 0.919 0.0813 0.000002
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 0.016451 2e-06 0.983543 0.000002 2e-06
## 2 1066 SOC 0.000002 2e-06 0.999991 0.000002 2e-06
## 3 1067 SOC 0.000002 2e-06 0.999991 0.000002 2e-06
## 4 1068 SOC 0.000002 2e-06 0.953741 0.046252 2e-06
## 5 1069 SOC 0.000002 2e-06 0.932638 0.067356 2e-06
## 6 1070 SOC 0.000002 2e-06 0.918655 0.081339 2e-06
Rename the columns
# Rename the columns starting from the third one
k5run4 <- k5run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run4)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 0.016451 2e-06 0.983543 0.000002 2e-06
## 2 1066 SOC 0.000002 2e-06 0.999991 0.000002 2e-06
## 3 1067 SOC 0.000002 2e-06 0.999991 0.000002 2e-06
## 4 1068 SOC 0.000002 2e-06 0.953741 0.046252 2e-06
## 5 1069 SOC 0.000002 2e-06 0.932638 0.067356 2e-06
## 6 1070 SOC 0.000002 2e-06 0.918655 0.081339 2e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k5run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FFFF19",
"purple3",
"#FF8C1A",
"#1E90FF",
"#77DD37"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n FastStructure for k1:40 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k6run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run004/simple.6.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k6run4)
## # A tibble: 6 × 6
## X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 0.000002 0.978 0.000002 0.0219 0.000002
## 2 0.000002 0.000002 1.00 0.000002 0.000002 0.000002
## 3 0.000002 0.000002 1.00 0.000002 0.000002 0.000002
## 4 0.000002 0.000002 0.934 0.000002 0.0657 0.000002
## 5 0.000002 0.000002 0.919 0.000002 0.0808 0.000002
## 6 0.000002 0.000002 0.902 0.0112 0.0865 0.000002
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1065 SOC 2e-06 2e-06 0.978077 0.000002 0.021916 2e-06
## 2 1066 SOC 2e-06 2e-06 0.999991 0.000002 0.000002 2e-06
## 3 1067 SOC 2e-06 2e-06 0.999991 0.000002 0.000002 2e-06
## 4 1068 SOC 2e-06 2e-06 0.934298 0.000002 0.065695 2e-06
## 5 1069 SOC 2e-06 2e-06 0.919204 0.000002 0.080789 2e-06
## 6 1070 SOC 2e-06 2e-06 0.902340 0.011192 0.086463 2e-06
Rename the columns
# Rename the columns starting from the third one
k6run4 <- k6run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k6run4)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1065 SOC 2e-06 2e-06 0.978077 0.000002 0.021916 2e-06
## 2 1066 SOC 2e-06 2e-06 0.999991 0.000002 0.000002 2e-06
## 3 1067 SOC 2e-06 2e-06 0.999991 0.000002 0.000002 2e-06
## 4 1068 SOC 2e-06 2e-06 0.934298 0.000002 0.065695 2e-06
## 5 1069 SOC 2e-06 2e-06 0.919204 0.000002 0.080789 2e-06
## 6 1070 SOC 2e-06 2e-06 0.902340 0.011192 0.086463 2e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k6run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"purple3",
"#1E90FF",
"#FF8C1A",
"#77DD37",
"red",
"#FFFF19"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:6)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n FastStructure for k1:40 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run73 <- read_delim(
here(
"output",
"europe",
"admixture",
"r_1_run073_simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k15run73 <- k15run73 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k15run73)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000001 0.000001 1.59e-1 1e-6 1e-6 1 e-6 0.841 1 e-6 1e-6 1e-6 1e-6
## 2 0.000001 0.000001 1 e-6 1e-6 1e-6 1 e-6 1.00 1 e-6 1e-6 1e-6 1e-6
## 3 0.000001 0.000001 9.24e-1 1e-6 1e-6 1 e-6 0.0757 1 e-6 1e-6 1e-6 1e-6
## 4 0.000001 0.000001 3.35e-3 1e-6 1e-6 1 e-6 0.996 7.92e-4 1e-6 1e-6 1e-6
## 5 0.000001 0.000001 5.06e-3 1e-6 1e-6 1 e-6 0.995 1 e-6 1e-6 1e-6 1e-6
## 6 0.0783 0.000001 1.20e-1 1e-6 1e-6 9.94e-3 0.791 1 e-6 1e-6 1e-6 1e-6
## # ℹ 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 1065 SOC 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001 1e-06
## 3 1067 SOC 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001 1e-06
## 4 1068 SOC 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792 1e-06
## 5 1069 SOC 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001 1e-06
## 6 1070 SOC 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001 1e-06
## X10 X11 X12 X13 X14 X15
## 1 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
Rename the columns
# Rename the columns starting from the third one
k15run73 <- k15run73 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run73)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9
## 1 1065 SOC 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001 1e-06
## 3 1067 SOC 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001 1e-06
## 4 1068 SOC 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792 1e-06
## 5 1069 SOC 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001 1e-06
## 6 1070 SOC 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001 1e-06
## v10 v11 v12 v13 v14 v15
## 1 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
Merge with pops
# Add an index column to Q_tibble
k15run73$index <- seq_len(nrow(k15run73))
# Perform the merge as before
df1 <-
merge(
k15run73,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8
## 310 SOC 1065 0.000001 1e-06 0.158884 1e-06 1e-06 0.000001 0.841107 0.000001
## 311 SOC 1066 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001 0.999990 0.000001
## 312 SOC 1067 0.000001 1e-06 0.924306 1e-06 1e-06 0.000001 0.075685 0.000001
## 313 SOC 1068 0.000001 1e-06 0.003355 1e-06 1e-06 0.000001 0.995845 0.000792
## 314 SOC 1069 0.000001 1e-06 0.005055 1e-06 1e-06 0.000001 0.994935 0.000001
## 315 SOC 1070 0.078274 1e-06 0.120436 1e-06 1e-06 0.009938 0.791345 0.000001
## v9 v10 v11 v12 v13 v14 v15 Latitude Longitude Pop_City
## 310 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042 39.74533 Sochi
## 311 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042 39.74533 Sochi
## 312 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042 39.74533 Sochi
## 313 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042 39.74533 Sochi
## 314 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042 39.74533 Sochi
## 315 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 43.60042 39.74533 Sochi
## Country Region Subregion Year order
## 310 Russia Eastern Europe East Europe 2021 38
## 311 Russia Eastern Europe East Europe 2021 38
## 312 Russia Eastern Europe East Europe 2021 38
## 313 Russia Eastern Europe East Europe 2021 38
## 314 Russia Eastern Europe East Europe 2021 38
## 315 Russia Eastern Europe East Europe 2021 38
make a palette with 15 colors
colors2 <-c(
"v1" = "#FF8C1A",
"v2" = "#FFFF99",
"v3" = "purple4",
"v4" = "#F49AC2",
"v5" = "#77DD77",
"v6" = "yellow2",
"v7" = "purple",
"v8" = "#B20CC9",
"v9" = "chocolate4",
"v10" = "blue",
"v11" = "green",
"v12" = "#FFB347",
"v13" = "#1E90FF",
"v14" = "#75FAFF",
"v15" = "#B22222"
)
colors2
## v1 v2 v3 v4 v5 v6
## "#FF8C1A" "#FFFF99" "purple4" "#F49AC2" "#77DD77" "yellow2"
## v7 v8 v9 v10 v11 v12
## "purple" "#B20CC9" "chocolate4" "blue" "green" "#FFB347"
## v13 v14 v15
## "#1E90FF" "#75FAFF" "#B22222"
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
my_theme()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run09 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run009/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k15run09)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 0.351 2e-6 2e-6 9.94e-2 5.25e-1 2e-6 2e-6 2 e-6 2e-6 2e-6
## 2 0.000002 1.00 2e-6 2e-6 2 e-6 2 e-6 2e-6 2e-6 2 e-6 2e-6 2e-6
## 3 0.000002 0.000002 2e-6 2e-6 1.00e+0 2 e-6 2e-6 2e-6 2 e-6 2e-6 2e-6
## 4 0.000002 0.637 2e-6 2e-6 2 e-6 3.24e-1 2e-6 2e-6 2 e-6 2e-6 2e-6
## 5 0.000002 0.388 2e-6 2e-6 2 e-6 5.67e-1 2e-6 2e-6 2 e-6 2e-6 2e-6
## 6 0.000002 0.588 2e-6 2e-6 2.02e-1 2 e-6 2e-6 2e-6 1.59e-1 2e-6 2e-6
## # ℹ 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 1065 SOC 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 2 1066 SOC 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 5 1069 SOC 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 6 1070 SOC 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
## X10 X11 X12 X13 X14 X15
## 1 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002
## 2 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 3 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 4 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298
## 5 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002
## 6 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002
Rename the columns
# Rename the columns starting from the third one
k15run09 <- k15run09 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run09)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9
## 1 1065 SOC 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 2 1066 SOC 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 5 1069 SOC 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 6 1070 SOC 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
## v10 v11 v12 v13 v14 v15
## 1 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002
## 2 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 3 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 4 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298
## 5 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002
## 6 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k15run09 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"purple4",
"#B20CD9",
"#B22222",
"purple",
"#75FAFF",
"#FFB347",
"#008080",
"yellow2",
"blue",
"#F49AC2",
"#77DD77",
"green",
"orchid",
"#1E90FF",
"#FF8C1A"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n FastStructure for k1:40 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k13run41 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run041/simple.13.meanQ"
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k13run41)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 0.0629 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 7.83e-1 2e-6 2e-6
## 2 0.000002 0.000002 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 8.81e-1 2e-6 2e-6
## 3 0.000002 0.000002 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2 e-6 2e-6 2e-6
## 4 0.0409 0.000002 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 9.59e-1 2e-6 2e-6
## 5 0.000002 0.0855 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 9.14e-1 2e-6 2e-6
## 6 0.000002 0.122 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 7.37e-1 2e-6 2e-6
## # ℹ 2 more variables: X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 0.000002 0.062911 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.782510 2e-06
## 2 1066 SOC 0.000002 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.881126 2e-06
## 3 1067 SOC 0.000002 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06
## 4 1068 SOC 0.040936 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.959039 2e-06
## 5 1069 SOC 0.000002 0.085520 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.914454 2e-06
## 6 1070 SOC 0.000002 0.121511 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.736631 2e-06
## X11 X12 X13
## 1 2e-06 0.154556 2e-06
## 2 2e-06 0.118848 2e-06
## 3 2e-06 0.999972 2e-06
## 4 2e-06 0.000002 2e-06
## 5 2e-06 0.000002 2e-06
## 6 2e-06 0.141834 2e-06
Rename the columns
# Rename the columns starting from the third one
k13run41 <- k13run41 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k13run41)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 0.000002 0.062911 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.782510 2e-06
## 2 1066 SOC 0.000002 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.881126 2e-06
## 3 1067 SOC 0.000002 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06
## 4 1068 SOC 0.040936 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.959039 2e-06
## 5 1069 SOC 0.000002 0.085520 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.914454 2e-06
## 6 1070 SOC 0.000002 0.121511 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.736631 2e-06
## v11 v12 v13
## 1 2e-06 0.154556 2e-06
## 2 2e-06 0.118848 2e-06
## 3 2e-06 0.999972 2e-06
## 4 2e-06 0.000002 2e-06
## 5 2e-06 0.000002 2e-06
## 6 2e-06 0.141834 2e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k13run41 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FF8C1A",
"blue",
"#77DD77",
"#B20CD9",
"#B22222",
"purple4",
"green",
"#008080",
"chocolate4",
"#F49AC2",
"#1E90FF",
"purple",
"yellow2"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:13)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n FastStructure for k1:40 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k18run35 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run035/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k18run35)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2e-6 2e-6 2 e-6 2e-6 5.28e-1 2e-6 2e-6 2e-6 2e-6 2e-6 2 e-6 2e-6
## 2 2e-6 2e-6 2 e-6 2e-6 9.54e-1 2e-6 2e-6 2e-6 2e-6 2e-6 2 e-6 2e-6
## 3 2e-6 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2 e-6 2e-6
## 4 2e-6 2e-6 2 e-6 2e-6 6.98e-1 2e-6 2e-6 2e-6 2e-6 2e-6 3.75e-2 2e-6
## 5 2e-6 2e-6 6.37e-2 2e-6 4.69e-1 2e-6 2e-6 2e-6 2e-6 2e-6 2 e-6 2e-6
## 6 2e-6 2e-6 2 e-6 2e-6 7.11e-1 2e-6 2e-6 2e-6 2e-6 2e-6 2 e-6 2e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## # X18 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 2e-06 2e-06 0.000002 2e-06 0.527816 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 2e-06 0.000002 2e-06 0.953589 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 2e-06 0.000002 2e-06 0.698323 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 2e-06 0.063682 2e-06 0.468962 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 2e-06 0.000002 2e-06 0.710877 2e-06 2e-06 2e-06 2e-06 2e-06
## X11 X12 X13 X14 X15 X16 X17 X18
## 1 0.000002 2e-06 2e-06 2e-06 2e-06 0.110473 2e-06 0.361686
## 2 0.000002 2e-06 2e-06 2e-06 2e-06 0.046384 2e-06 0.000002
## 3 0.000002 2e-06 2e-06 2e-06 2e-06 0.999972 2e-06 0.000002
## 4 0.037455 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06 0.264197
## 5 0.000002 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06 0.467332
## 6 0.000002 2e-06 2e-06 2e-06 2e-06 0.289096 2e-06 0.000002
Rename the columns
# Rename the columns starting from the third one
k18run35 <- k18run35 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k18run35)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 2e-06 2e-06 0.000002 2e-06 0.527816 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 2e-06 0.000002 2e-06 0.953589 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 2e-06 0.000002 2e-06 0.698323 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 2e-06 0.063682 2e-06 0.468962 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 2e-06 0.000002 2e-06 0.710877 2e-06 2e-06 2e-06 2e-06 2e-06
## v11 v12 v13 v14 v15 v16 v17 v18
## 1 0.000002 2e-06 2e-06 2e-06 2e-06 0.110473 2e-06 0.361686
## 2 0.000002 2e-06 2e-06 2e-06 2e-06 0.046384 2e-06 0.000002
## 3 0.000002 2e-06 2e-06 2e-06 2e-06 0.999972 2e-06 0.000002
## 4 0.037455 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06 0.264197
## 5 0.000002 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06 0.467332
## 6 0.000002 2e-06 2e-06 2e-06 2e-06 0.289096 2e-06 0.000002
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k18run35 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"yellow2",
"#B20CD9",
"#B22222",
"#FFB347",
"#F49AC2",
"#FFFF99",
"#75FAFF",
"blue",
"green4",
"green",
"#FF8C1A",
"#77DD77",
"#008080",
"orchid",
"goldenrod",
"purple",
"#1E90FF",
"purple4"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:18)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n FastStructure for k1:40 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k5run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run004/simple.5.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run4)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000006 0.000006 0.0728 0.000006 0.927
## 2 0.000006 0.000006 0.000006 0.000006 1.00
## 3 0.000006 0.000006 0.000006 0.000006 1.00
## 4 0.120 0.000006 0.0220 0.000006 0.858
## 5 0.114 0.000006 0.0497 0.000006 0.837
## 6 0.0746 0.000006 0.0772 0.000006 0.848
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 0.000006 6e-06 0.072818 6e-06 0.927163
## 2 1066 SOC 0.000006 6e-06 0.000006 6e-06 0.999975
## 3 1067 SOC 0.000006 6e-06 0.000006 6e-06 0.999975
## 4 1068 SOC 0.119502 6e-06 0.022025 6e-06 0.858461
## 5 1069 SOC 0.113794 6e-06 0.049666 6e-06 0.836528
## 6 1070 SOC 0.074577 6e-06 0.077227 6e-06 0.848184
Rename the columns
# Rename the columns starting from the third one
k5run4 <- k5run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run4)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 0.000006 6e-06 0.072818 6e-06 0.927163
## 2 1066 SOC 0.000006 6e-06 0.000006 6e-06 0.999975
## 3 1067 SOC 0.000006 6e-06 0.000006 6e-06 0.999975
## 4 1068 SOC 0.119502 6e-06 0.022025 6e-06 0.858461
## 5 1069 SOC 0.113794 6e-06 0.049666 6e-06 0.836528
## 6 1070 SOC 0.074577 6e-06 0.077227 6e-06 0.848184
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k5run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#1E90FF",
"#FFFF19",
"purple3",
"#77DD37",
"#FF8C1A"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n FastStructure for k1:40 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k6run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run004/simple.6.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k6run4)
## # A tibble: 6 × 6
## X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.947 0.0533 0.000005 0.000005 0.000005 0.000005
## 2 1.00 0.000005 0.000005 0.000005 0.000005 0.000005
## 3 1.00 0.000005 0.000005 0.000005 0.000005 0.000005
## 4 0.903 0.0567 0.0407 0.000005 0.000005 0.000005
## 5 0.851 0.00334 0.0308 0.115 0.000005 0.000005
## 6 0.887 0.00293 0.110 0.000005 0.000005 0.000005
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1065 SOC 0.946689 0.053291 0.000005 0.000005 5e-06 5e-06
## 2 1066 SOC 0.999975 0.000005 0.000005 0.000005 5e-06 5e-06
## 3 1067 SOC 0.999975 0.000005 0.000005 0.000005 5e-06 5e-06
## 4 1068 SOC 0.902567 0.056717 0.040701 0.000005 5e-06 5e-06
## 5 1069 SOC 0.850611 0.003335 0.030846 0.115197 5e-06 5e-06
## 6 1070 SOC 0.886852 0.002929 0.110204 0.000005 5e-06 5e-06
Rename the columns
# Rename the columns starting from the third one
k6run4 <- k6run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k6run4)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1065 SOC 0.946689 0.053291 0.000005 0.000005 5e-06 5e-06
## 2 1066 SOC 0.999975 0.000005 0.000005 0.000005 5e-06 5e-06
## 3 1067 SOC 0.999975 0.000005 0.000005 0.000005 5e-06 5e-06
## 4 1068 SOC 0.902567 0.056717 0.040701 0.000005 5e-06 5e-06
## 5 1069 SOC 0.850611 0.003335 0.030846 0.115197 5e-06 5e-06
## 6 1070 SOC 0.886852 0.002929 0.110204 0.000005 5e-06 5e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k6run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#77DD37",
"#FF8C1A",
"#1E90FF",
"#FFFF19",
"purple3",
"red"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:6)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n FastStructure for k1:40 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run09 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run009/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k15run09 <- k15run09 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k15run09)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 0.351 2e-6 2e-6 9.94e-2 5.25e-1 2e-6 2e-6 2 e-6 2e-6 2e-6
## 2 0.000002 1.00 2e-6 2e-6 2 e-6 2 e-6 2e-6 2e-6 2 e-6 2e-6 2e-6
## 3 0.000002 0.000002 2e-6 2e-6 1.00e+0 2 e-6 2e-6 2e-6 2 e-6 2e-6 2e-6
## 4 0.000002 0.637 2e-6 2e-6 2 e-6 3.24e-1 2e-6 2e-6 2 e-6 2e-6 2e-6
## 5 0.000002 0.388 2e-6 2e-6 2 e-6 5.67e-1 2e-6 2e-6 2 e-6 2e-6 2e-6
## 6 0.000002 0.588 2e-6 2e-6 2.02e-1 2 e-6 2e-6 2e-6 1.59e-1 2e-6 2e-6
## # ℹ 4 more variables: X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 1065 SOC 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 2 1066 SOC 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 5 1069 SOC 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 6 1070 SOC 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
## X10 X11 X12 X13 X14 X15
## 1 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002
## 2 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 3 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 4 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298
## 5 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002
## 6 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002
Rename the columns
# Rename the columns starting from the third one
k15run09 <- k15run09 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run09)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9
## 1 1065 SOC 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 2 1066 SOC 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 5 1069 SOC 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 6 1070 SOC 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
## v10 v11 v12 v13 v14 v15
## 1 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002
## 2 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 3 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002
## 4 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298
## 5 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002
## 6 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002
Merge with pops
# Add an index column to Q_tibble
k15run09$index <- seq_len(nrow(k15run09))
# Perform the merge as before
df1 <-
merge(
k15run09,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8 v9
## 310 SOC 1065 2e-06 0.351212 2e-06 2e-06 0.099439 0.524848 2e-06 2e-06 0.000002
## 311 SOC 1066 2e-06 0.999972 2e-06 2e-06 0.000002 0.000002 2e-06 2e-06 0.000002
## 312 SOC 1067 2e-06 0.000002 2e-06 2e-06 0.999972 0.000002 2e-06 2e-06 0.000002
## 313 SOC 1068 2e-06 0.636724 2e-06 2e-06 0.000002 0.323955 2e-06 2e-06 0.000002
## 314 SOC 1069 2e-06 0.387571 2e-06 2e-06 0.000002 0.567231 2e-06 2e-06 0.000002
## 315 SOC 1070 2e-06 0.588064 2e-06 2e-06 0.202269 0.000002 2e-06 2e-06 0.159462
## v10 v11 v12 v13 v14 v15 Latitude Longitude Pop_City
## 310 2e-06 2e-06 0.000002 2e-06 0.024479 0.000002 43.60042 39.74533 Sochi
## 311 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002 43.60042 39.74533 Sochi
## 312 2e-06 2e-06 0.000002 2e-06 0.000002 0.000002 43.60042 39.74533 Sochi
## 313 2e-06 2e-06 0.000002 2e-06 0.000002 0.039298 43.60042 39.74533 Sochi
## 314 2e-06 2e-06 0.045174 2e-06 0.000002 0.000002 43.60042 39.74533 Sochi
## 315 2e-06 2e-06 0.050183 2e-06 0.000002 0.000002 43.60042 39.74533 Sochi
## Country Region Subregion Year order
## 310 Russia Eastern Europe East Europe 2021 38
## 311 Russia Eastern Europe East Europe 2021 38
## 312 Russia Eastern Europe East Europe 2021 38
## 313 Russia Eastern Europe East Europe 2021 38
## 314 Russia Eastern Europe East Europe 2021 38
## 315 Russia Eastern Europe East Europe 2021 38
make a palette with 15 colors
colors2 <-c(
"v1" = "#F49AC2",
"v2" = "#B20CC9",
"v3" = "#77DD77",
"v4" = "green",
"v5" = "purple4",
"v6" = "purple",
"v7" = "#1E90FF",
"v8" = "#FFB347",
"v9" = "#FF8C1A",
"v10" = "blue",
"v11" = "#75FAFF",
"v12" = "yellow2",
"v13" = "#B22222",
"v14" = "#008080",
"v15" = "orchid"
)
colors2
## v1 v2 v3 v4 v5 v6 v7 v8
## "#F49AC2" "#B20CC9" "#77DD77" "green" "purple4" "purple" "#1E90FF" "#FFB347"
## v9 v10 v11 v12 v13 v14 v15
## "#FF8C1A" "blue" "#75FAFF" "yellow2" "#B22222" "#008080" "orchid"
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
my_theme()
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run07 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run007/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k15run07)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 1.00 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 2 0.000002 1.00 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 3 0.000002 1.00 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 4 0.000002 0.927 0.0732 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 5 0.000002 0.996 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 6 0.000002 0.999 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## # ℹ 3 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## X11 X12 X13 X14 X15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742
Rename the columns
# Rename the columns starting from the third one
k15run07 <- k15run07 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run07)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## v11 v12 v13 v14 v15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k15run07 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"green",
"green4",
"#1E90FF",
"#75FAFF",
"purple",
"#FF8C1A",
"#B22222",
"#FFB347",
"blue",
"#008080",
"chocolate4",
"purple4",
"#77DD77",
"#F49AC2",
"yellow2"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n FastStructure for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k13run8 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run008/simple.13.meanQ"
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k13run8)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.546 0.443 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 1.17e-2 2 e-6 2e-6
## 2 0.000002 1.00 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 2 e-6 2 e-6 2e-6
## 3 0.000002 1.00 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 2 e-6 2 e-6 2e-6
## 4 0.000002 0.843 2e-6 2 e-6 2e-6 1.57e-1 2e-6 2e-6 2 e-6 2 e-6 2e-6
## 5 0.483 0.475 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 4.20e-2 2 e-6 2e-6
## 6 0.313 0.577 2e-6 2.64e-2 2e-6 2 e-6 2e-6 2e-6 2 e-6 8.40e-2 2e-6
## # ℹ 2 more variables: X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
## X10 X11 X12 X13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06
Rename the columns
# Rename the columns starting from the third one
k13run8 <- k13run8 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k13run8)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
## v10 v11 v12 v13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k13run8 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#1E90FF",
"#008080",
"blue",
"#FF8C1A",
"chocolate4",
"yellow2",
"purple4",
"purple",
"green",
"#B22222",
"#008080",
"#B20CD9",
"#F49AC2"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:13)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n FastStructure for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k18run2 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run002/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k18run2)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1e-6 1e-6 1e-6 1e-6 2.77e-1 1e-6 1e-6 6.23e-1 1e-6 1e-6 1 e-6 1e-6
## 2 1e-6 1e-6 1e-6 1e-6 1.00e+0 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6
## 3 1e-6 1e-6 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6
## 4 1e-6 1e-6 1e-6 1e-6 5.60e-1 1e-6 1e-6 4.33e-1 1e-6 1e-6 6.86e-3 1e-6
## 5 1e-6 1e-6 1e-6 1e-6 2.69e-1 1e-6 1e-6 7.31e-1 1e-6 1e-6 1 e-6 1e-6
## 6 1e-6 1e-6 1e-6 1e-6 4.14e-1 1e-6 1e-6 3.85e-1 1e-6 1e-6 1 e-6 1e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## # X18 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
## X11 X12 X13 X14 X15 X16 X17 X18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06
Rename the columns
# Rename the columns starting from the third one
k18run2 <- k18run2 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k18run2)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
## v11 v12 v13 v14 v15 v16 v17 v18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k18run2 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"green",
"#77DD77",
"blue",
"yellow2",
"#1E90FF",
"green4",
"purple4",
"#F49AC2",
"#FFB347",
"orchid",
"purple",
"chocolate4",
"#B22222",
"#75FAFF",
"#FF8C1A",
"#B20CD9",
"#FFFF99",
"#008080"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:18)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n FastStructure for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k5run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.5.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run4)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000005 0.000005 0.0292 0.971 0.000005
## 2 0.000005 0.000005 0.000005 1.00 0.000005
## 3 0.000005 0.000005 0.000005 1.00 0.000005
## 4 0.000005 0.104 0.00251 0.894 0.000005
## 5 0.000005 0.101 0.000005 0.871 0.0275
## 6 0.000005 0.0316 0.000005 0.889 0.0796
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 5e-06 0.000005 0.029175 0.970810 0.000005
## 2 1066 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 3 1067 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 4 1068 SOC 5e-06 0.103530 0.002506 0.893954 0.000005
## 5 1069 SOC 5e-06 0.101074 0.000005 0.871372 0.027544
## 6 1070 SOC 5e-06 0.031613 0.000005 0.888727 0.079650
Rename the columns
# Rename the columns starting from the third one
k5run4 <- k5run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run4)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 5e-06 0.000005 0.029175 0.970810 0.000005
## 2 1066 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 3 1067 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 4 1068 SOC 5e-06 0.103530 0.002506 0.893954 0.000005
## 5 1069 SOC 5e-06 0.101074 0.000005 0.871372 0.027544
## 6 1070 SOC 5e-06 0.031613 0.000005 0.888727 0.079650
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k5run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FFFF19",
"purple3",
"#1E90FF",
"#FF8C1A",
"#77DD37"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n FastStructure for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k6run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.6.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k6run4)
## # A tibble: 6 × 6
## X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000004 0.000004 0.950 0.0503 0.000004 0.000004
## 2 0.000004 0.000004 1.00 0.000004 0.000004 0.000004
## 3 0.000004 0.000004 1.00 0.000004 0.000004 0.000004
## 4 0.0835 0.000004 0.897 0.0196 0.000075 0.000004
## 5 0.000004 0.000004 0.927 0.0728 0.000004 0.000004
## 6 0.0653 0.000004 0.878 0.0569 0.000004 0.000004
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1065 SOC 0.000004 4e-06 0.949654 0.050330 4.0e-06 4e-06
## 2 1066 SOC 0.000004 4e-06 0.999979 0.000004 4.0e-06 4e-06
## 3 1067 SOC 0.000004 4e-06 0.999979 0.000004 4.0e-06 4e-06
## 4 1068 SOC 0.083468 4e-06 0.896836 0.019613 7.5e-05 4e-06
## 5 1069 SOC 0.000004 4e-06 0.927134 0.072850 4.0e-06 4e-06
## 6 1070 SOC 0.065259 4e-06 0.877866 0.056863 4.0e-06 4e-06
Rename the columns
# Rename the columns starting from the third one
k6run4 <- k6run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k6run4)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1065 SOC 0.000004 4e-06 0.949654 0.050330 4.0e-06 4e-06
## 2 1066 SOC 0.000004 4e-06 0.999979 0.000004 4.0e-06 4e-06
## 3 1067 SOC 0.000004 4e-06 0.999979 0.000004 4.0e-06 4e-06
## 4 1068 SOC 0.083468 4e-06 0.896836 0.019613 7.5e-05 4e-06
## 5 1069 SOC 0.000004 4e-06 0.927134 0.072850 4.0e-06 4e-06
## 6 1070 SOC 0.065259 4e-06 0.877866 0.056863 4.0e-06 4e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k6run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"purple3",
"#FF8C1A",
"#FFFF19",
"#1E90FF",
"red",
"#77DD37"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:6)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n FastStructure for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k20run13 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run013/simple.20.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k20run13)
## # A tibble: 6 × 20
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1e-6 1 e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1 e-6 1.88e-1
## 2 1e-6 1 e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1 e-6 2.36e-4
## 3 1e-6 1 e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1 e-6 1.00e+0
## 4 1e-6 1.10e-3 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2.98e-2 1 e-6
## 5 1e-6 1 e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1 e-6 1 e-6
## 6 1e-6 1 e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1 e-6 1.49e-1
## # ℹ 8 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## # X18 <dbl>, X19 <dbl>, X20 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1066 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1067 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1068 SOC 1e-06 0.001097 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1069 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1070 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## X11 X12 X13 X14 X15 X16 X17 X18 X19 X20
## 1 0.000001 0.188189 1e-06 0.811789 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 2 0.000001 0.000236 1e-06 0.999742 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 3 0.000001 0.999977 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 4 0.029811 0.000001 1e-06 0.950145 1e-06 1e-06 0.018928 1e-06 1e-06 1e-06
## 5 0.000001 0.000001 1e-06 0.951559 1e-06 1e-06 0.048419 1e-06 1e-06 1e-06
## 6 0.000001 0.149064 1e-06 0.479014 1e-06 1e-06 0.371902 1e-06 1e-06 1e-06
Rename the columns
# Rename the columns starting from the third one
k20run13 <- k20run13 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k20run13)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1066 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1067 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1068 SOC 1e-06 0.001097 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1069 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1070 SOC 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## v11 v12 v13 v14 v15 v16 v17 v18 v19 v20
## 1 0.000001 0.188189 1e-06 0.811789 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 2 0.000001 0.000236 1e-06 0.999742 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 3 0.000001 0.999977 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06 1e-06
## 4 0.029811 0.000001 1e-06 0.950145 1e-06 1e-06 0.018928 1e-06 1e-06 1e-06
## 5 0.000001 0.000001 1e-06 0.951559 1e-06 1e-06 0.048419 1e-06 1e-06 1e-06
## 6 0.000001 0.149064 1e-06 0.479014 1e-06 1e-06 0.371902 1e-06 1e-06 1e-06
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
Melt data frame for plotting
# Melt the data frame for plotting
Q_melted <- k20run13 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"purple",
"yellow2",
"#B20CD9",
"#FFB347",
"#F49AC2",
"#B22222",
"orchid",
"goldenrod3",
"gray",
"#1E90FF",
"green4",
"#FFFF99",
"green",
"#008080",
"#75FAFF",
"chocolate4",
"#77DD77",
"purple4",
"#FF8C1A",
"blue")
# Generate all potential variable names
all_variables <- paste0("v", 1:20)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=20.\n FastStructure for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K15 for fastStructure) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run07 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run007/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k15run07 <- k15run07 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k15run07)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 1.00 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 2 0.000002 1.00 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 3 0.000002 1.00 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 4 0.000002 0.927 0.0732 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 5 0.000002 0.996 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 6 0.000002 0.999 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## # ℹ 3 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## X11 X12 X13 X14 X15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742
Rename the columns
# Rename the columns starting from the third one
k15run07 <- k15run07 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run07)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## v11 v12 v13 v14 v15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742
Merge with pops
# Add an index column to Q_tibble
k15run07$index <- seq_len(nrow(k15run07))
# Perform the merge as before
df1 <-
merge(
k15run07,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 310 SOC 1065 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 311 SOC 1066 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 312 SOC 1067 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 313 SOC 1068 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 314 SOC 1069 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 315 SOC 1070 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## v11 v12 v13 v14 v15 Latitude Longitude Pop_City Country
## 310 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042 39.74533 Sochi Russia
## 311 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042 39.74533 Sochi Russia
## 312 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042 39.74533 Sochi Russia
## 313 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042 39.74533 Sochi Russia
## 314 2e-06 2e-06 2e-06 2e-06 0.004162 43.60042 39.74533 Sochi Russia
## 315 2e-06 2e-06 2e-06 2e-06 0.000742 43.60042 39.74533 Sochi Russia
## Region Subregion Year order
## 310 Eastern Europe East Europe 2021 38
## 311 Eastern Europe East Europe 2021 38
## 312 Eastern Europe East Europe 2021 38
## 313 Eastern Europe East Europe 2021 38
## 314 Eastern Europe East Europe 2021 38
## 315 Eastern Europe East Europe 2021 38
Q-values for k=15
make a palette with 15 colors
colors2 <-c(
"v1" = "#F49AC2",
"v2" = "purple",
"v3" = "#FF8C1A",
"v4" = "#77DD77",
"v5" = "#1E90FF",
"v6" = "blue",
"v7" = "green",
"v8" = "yellow2",
"v9" = "#008080",
"v10" = "#B22222",
"v11" = "#75FAFF",
"v12" = "chocolate4",
"v13" = "purple4",
"v14" = "green4",
"v15" = "#FFB347"
)
colors2
## v1 v2 v3 v4 v5 v6
## "#F49AC2" "purple" "#FF8C1A" "#77DD77" "#1E90FF" "blue"
## v7 v8 v9 v10 v11 v12
## "green" "yellow2" "#008080" "#B22222" "#75FAFF" "chocolate4"
## v13 v14 v15
## "purple4" "green4" "#FFB347"
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k15_pie_all_countries_zoom.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k15_pie_all_countries_outlines.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K5 for fastStructure) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k5run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.5.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k5run4 <- k5run4 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k5run4)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000005 0.000005 0.0292 0.971 0.000005
## 2 0.000005 0.000005 0.000005 1.00 0.000005
## 3 0.000005 0.000005 0.000005 1.00 0.000005
## 4 0.000005 0.104 0.00251 0.894 0.000005
## 5 0.000005 0.101 0.000005 0.871 0.0275
## 6 0.000005 0.0316 0.000005 0.889 0.0796
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 5e-06 0.000005 0.029175 0.970810 0.000005
## 2 1066 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 3 1067 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 4 1068 SOC 5e-06 0.103530 0.002506 0.893954 0.000005
## 5 1069 SOC 5e-06 0.101074 0.000005 0.871372 0.027544
## 6 1070 SOC 5e-06 0.031613 0.000005 0.888727 0.079650
Rename the columns
# Rename the columns starting from the third one
k5run4 <- k5run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k5run4)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 5e-06 0.000005 0.029175 0.970810 0.000005
## 2 1066 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 3 1067 SOC 5e-06 0.000005 0.000005 0.999980 0.000005
## 4 1068 SOC 5e-06 0.103530 0.002506 0.893954 0.000005
## 5 1069 SOC 5e-06 0.101074 0.000005 0.871372 0.027544
## 6 1070 SOC 5e-06 0.031613 0.000005 0.888727 0.079650
Merge with pops
# Add an index column to Q_tibble
k5run4$index <- seq_len(nrow(k5run4))
# Perform the merge as before
df1 <-
merge(
k5run4,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 310 SOC 1065 5e-06 0.000005 0.029175 0.970810 0.000005 43.60042 39.74533
## 311 SOC 1066 5e-06 0.000005 0.000005 0.999980 0.000005 43.60042 39.74533
## 312 SOC 1067 5e-06 0.000005 0.000005 0.999980 0.000005 43.60042 39.74533
## 313 SOC 1068 5e-06 0.103530 0.002506 0.893954 0.000005 43.60042 39.74533
## 314 SOC 1069 5e-06 0.101074 0.000005 0.871372 0.027544 43.60042 39.74533
## 315 SOC 1070 5e-06 0.031613 0.000005 0.888727 0.079650 43.60042 39.74533
## Pop_City Country Region Subregion Year order
## 310 Sochi Russia Eastern Europe East Europe 2021 38
## 311 Sochi Russia Eastern Europe East Europe 2021 38
## 312 Sochi Russia Eastern Europe East Europe 2021 38
## 313 Sochi Russia Eastern Europe East Europe 2021 38
## 314 Sochi Russia Eastern Europe East Europe 2021 38
## 315 Sochi Russia Eastern Europe East Europe 2021 38
Q-values for k=5
make a palette with 5 colors
colors2 <-c(
"v1" = "#FFFF19",
"v2" = "#FF8C1A",
"v3" = "#77DD37",
"v4" = "purple3",
"v5" = "#1E90FF"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
my_theme()
# #
ggsave(
here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k5_pie_all_countries.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k5_pie_all_countries_outlines.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K18 for fastStructure) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k18run2 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run002/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k18run2 <- k18run2 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k18run2)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1e-6 1e-6 1e-6 1e-6 2.77e-1 1e-6 1e-6 6.23e-1 1e-6 1e-6 1 e-6 1e-6
## 2 1e-6 1e-6 1e-6 1e-6 1.00e+0 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6
## 3 1e-6 1e-6 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6
## 4 1e-6 1e-6 1e-6 1e-6 5.60e-1 1e-6 1e-6 4.33e-1 1e-6 1e-6 6.86e-3 1e-6
## 5 1e-6 1e-6 1e-6 1e-6 2.69e-1 1e-6 1e-6 7.31e-1 1e-6 1e-6 1 e-6 1e-6
## 6 1e-6 1e-6 1e-6 1e-6 4.14e-1 1e-6 1e-6 3.85e-1 1e-6 1e-6 1 e-6 1e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## # X18 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
## X11 X12 X13 X14 X15 X16 X17 X18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06
Rename the columns
# Rename the columns starting from the third one
k18run2 <- k18run2 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k18run2)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
## v11 v12 v13 v14 v15 v16 v17 v18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06
Merge with pops
# Add an index column to Q_tibble
k18run2$index <- seq_len(nrow(k18run2))
# Perform the merge as before
df1 <-
merge(
k18run2,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 310 SOC 1065 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 311 SOC 1066 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 312 SOC 1067 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 313 SOC 1068 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 314 SOC 1069 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 315 SOC 1070 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
## v11 v12 v13 v14 v15 v16 v17 v18 Latitude Longitude
## 310 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06 43.60042 39.74533
## 311 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042 39.74533
## 312 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06 43.60042 39.74533
## 313 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042 39.74533
## 314 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042 39.74533
## 315 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06 43.60042 39.74533
## Pop_City Country Region Subregion Year order
## 310 Sochi Russia Eastern Europe East Europe 2021 38
## 311 Sochi Russia Eastern Europe East Europe 2021 38
## 312 Sochi Russia Eastern Europe East Europe 2021 38
## 313 Sochi Russia Eastern Europe East Europe 2021 38
## 314 Sochi Russia Eastern Europe East Europe 2021 38
## 315 Sochi Russia Eastern Europe East Europe 2021 38
Q-values for k=18
make a palette with 18 colors
colors2 <-c(
"v1" = "#008080",
"v2" = "#1E90FF",
"v3" = "green4",
"v4" = "#FFB347",
"v5" = "#B20CD9",
"v6" = "orchid",
"v7" = "blue",
"v8" = "purple",
"v9" = "#75FAFF",
"v10" = "#B22222",
"v11" = "#F49AC2",
"v12" = "yellow2",
"v13" = "green",
"v14" = "#FF8C1A",
"v15" = "chocolate4",
"v16" = "purple4",
"v17" = "#77DD77",
"v18" = "#FFFF99"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k18_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K14 for fastStructure) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k14run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.14.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k14run4 <- k14run4 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k14run4)
## # A tibble: 6 × 14
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 6.09e-1 0.391 2e-6 2e-6 2e-6
## 2 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 2 e-6 1.00 2e-6 2e-6 2e-6
## 3 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 3.98e-1 0.602 2e-6 2e-6 2e-6
## 4 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 4.37e-1 0.563 2e-6 2e-6 2e-6
## 5 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 6.67e-1 0.333 2e-6 2e-6 2e-6
## 6 0.000002 2e-6 2e-6 1.02e-1 2e-6 2e-6 2e-6 4.70e-1 0.428 2e-6 2e-6 2e-6
## # ℹ 2 more variables: X13 <dbl>, X14 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201 2e-06
## 2 1066 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977 2e-06
## 3 1067 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095 2e-06
## 4 1068 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699 2e-06
## 5 1069 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931 2e-06
## 6 1070 SOC 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289 2e-06
## X11 X12 X13 X14
## 1 2e-06 2e-06 2e-06 2e-06
## 2 2e-06 2e-06 2e-06 2e-06
## 3 2e-06 2e-06 2e-06 2e-06
## 4 2e-06 2e-06 2e-06 2e-06
## 5 2e-06 2e-06 2e-06 2e-06
## 6 2e-06 2e-06 2e-06 2e-06
Rename the columns
# Rename the columns starting from the third one
k14run4 <- k14run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k14run4)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201 2e-06
## 2 1066 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977 2e-06
## 3 1067 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095 2e-06
## 4 1068 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699 2e-06
## 5 1069 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931 2e-06
## 6 1070 SOC 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289 2e-06
## v11 v12 v13 v14
## 1 2e-06 2e-06 2e-06 2e-06
## 2 2e-06 2e-06 2e-06 2e-06
## 3 2e-06 2e-06 2e-06 2e-06
## 4 2e-06 2e-06 2e-06 2e-06
## 5 2e-06 2e-06 2e-06 2e-06
## 6 2e-06 2e-06 2e-06 2e-06
Merge with pops
# Add an index column to Q_tibble
k14run4$index <- seq_len(nrow(k14run4))
# Perform the merge as before
df1 <-
merge(
k14run4,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8 v9
## 310 SOC 1065 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201
## 311 SOC 1066 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977
## 312 SOC 1067 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095
## 313 SOC 1068 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699
## 314 SOC 1069 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931
## 315 SOC 1070 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289
## v10 v11 v12 v13 v14 Latitude Longitude Pop_City Country
## 310 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 311 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 312 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 313 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 314 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 315 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## Region Subregion Year order
## 310 Eastern Europe East Europe 2021 38
## 311 Eastern Europe East Europe 2021 38
## 312 Eastern Europe East Europe 2021 38
## 313 Eastern Europe East Europe 2021 38
## 314 Eastern Europe East Europe 2021 38
## 315 Eastern Europe East Europe 2021 38
Q-values for k=14
make a palette with 14 colors
colors2 <-c(
"v1" = "#F49AC2",
"v2" = "#77DD77",
"v3" = "blue",
"v4" = "#FF8C1A",
"v5" = "#B20CC9",
"v6" = "green4",
"v7" = "#B22222",
"v8" = "purple4",
"v9" = "purple",
"v10" = "chocolate4",
"v11" = "#1E90FF",
"v12" = "#008080",
"v13" = "green",
"v14" = "yellow2"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k14_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K13 for fastStructure) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k13run8 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run008/simple.13.meanQ"
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k13run8 <- k13run8 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k13run8)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.546 0.443 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 1.17e-2 2 e-6 2e-6
## 2 0.000002 1.00 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 2 e-6 2 e-6 2e-6
## 3 0.000002 1.00 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 2 e-6 2 e-6 2e-6
## 4 0.000002 0.843 2e-6 2 e-6 2e-6 1.57e-1 2e-6 2e-6 2 e-6 2 e-6 2e-6
## 5 0.483 0.475 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 4.20e-2 2 e-6 2e-6
## 6 0.313 0.577 2e-6 2.64e-2 2e-6 2 e-6 2e-6 2e-6 2 e-6 8.40e-2 2e-6
## # ℹ 2 more variables: X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
## X10 X11 X12 X13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06
Rename the columns
# Rename the columns starting from the third one
k13run8 <- k13run8 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k13run8)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
## v10 v11 v12 v13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06
Merge with pops
# Add an index column to Q_tibble
k13run8$index <- seq_len(nrow(k13run8))
# Perform the merge as before
df1 <-
merge(
k13run8,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8
## 310 SOC 1065 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 311 SOC 1066 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 312 SOC 1067 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 313 SOC 1068 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06
## 314 SOC 1069 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 315 SOC 1070 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06
## v9 v10 v11 v12 v13 Latitude Longitude Pop_City Country
## 310 0.011724 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 311 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 312 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 313 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 314 0.042005 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 315 0.000002 0.084006 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## Region Subregion Year order
## 310 Eastern Europe East Europe 2021 38
## 311 Eastern Europe East Europe 2021 38
## 312 Eastern Europe East Europe 2021 38
## 313 Eastern Europe East Europe 2021 38
## 314 Eastern Europe East Europe 2021 38
## 315 Eastern Europe East Europe 2021 38
Q-values for k=13
make a palette with 13 colors
colors2 <-c(
"v1" = "purple4",
"v2" = "purple",
"v3" = "blue",
"v4" = "#FF8C1A",
"v5" = "chocolate4",
"v6" = "#B20CD9",
"v7" = "#B22222",
"v8" = "#1E90FF",
"v9" = "green",
"v10" = "yellow2",
"v11" = "#77DD77",
"v12" = "#008080",
"v13" = "#F49AC2"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("output", "europe", "figures", "fastStructure", "MAF_1", "fastStructure_MAF1_r01_k13_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
####5.3.1 k=15 Import the Q matrix (K15 for fastStructure) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run07 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run007/simple.15.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k15run07 <- k15run07 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k15run07)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 1.00 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 2 0.000002 1.00 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 3 0.000002 1.00 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 4 0.000002 0.927 0.0732 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 5 0.000002 0.996 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## 6 0.000002 0.999 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6
## # ℹ 3 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## X11 X12 X13 X14 X15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742
Rename the columns
# Rename the columns starting from the third one
k15run07 <- k15run07 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run07)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 2 1066 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 3 1067 SOC 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 4 1068 SOC 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 5 1069 SOC 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 6 1070 SOC 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## v11 v12 v13 v14 v15
## 1 2e-06 2e-06 2e-06 2e-06 0.000002
## 2 2e-06 2e-06 2e-06 2e-06 0.000002
## 3 2e-06 2e-06 2e-06 2e-06 0.000002
## 4 2e-06 2e-06 2e-06 2e-06 0.000002
## 5 2e-06 2e-06 2e-06 2e-06 0.004162
## 6 2e-06 2e-06 2e-06 2e-06 0.000742
Merge with pops
# Add an index column to Q_tibble
k15run07$index <- seq_len(nrow(k15run07))
# Perform the merge as before
df1 <-
merge(
k15run07,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 310 SOC 1065 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 311 SOC 1066 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 312 SOC 1067 2e-06 0.999977 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 313 SOC 1068 2e-06 0.926789 0.073190 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 314 SOC 1069 2e-06 0.995817 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## 315 SOC 1070 2e-06 0.999237 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06
## v11 v12 v13 v14 v15 Latitude Longitude Pop_City Country
## 310 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042 39.74533 Sochi Russia
## 311 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042 39.74533 Sochi Russia
## 312 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042 39.74533 Sochi Russia
## 313 2e-06 2e-06 2e-06 2e-06 0.000002 43.60042 39.74533 Sochi Russia
## 314 2e-06 2e-06 2e-06 2e-06 0.004162 43.60042 39.74533 Sochi Russia
## 315 2e-06 2e-06 2e-06 2e-06 0.000742 43.60042 39.74533 Sochi Russia
## Region Subregion Year order
## 310 Eastern Europe East Europe 2021 38
## 311 Eastern Europe East Europe 2021 38
## 312 Eastern Europe East Europe 2021 38
## 313 Eastern Europe East Europe 2021 38
## 314 Eastern Europe East Europe 2021 38
## 315 Eastern Europe East Europe 2021 38
Q-values for k=15
make a palette with 15 colors
colors2 <-c(
"v1" = "#F49AC2",
"v2" = "purple",
"v3" = "#FF8C1A",
"v4" = "#77DD77",
"v5" = "#1E90FF",
"v6" = "blue",
"v7" = "green",
"v8" = "yellow2",
"v9" = "#008080",
"v10" = "#B22222",
"v11" = "#75FAFF",
"v12" = "chocolate4",
"v13" = "purple4",
"v14" = "green4",
"v15" = "#FFB347"
)
colors2
## v1 v2 v3 v4 v5 v6
## "#F49AC2" "purple" "#FF8C1A" "#77DD77" "#1E90FF" "blue"
## v7 v8 v9 v10 v11 v12
## "green" "yellow2" "#008080" "#B22222" "#75FAFF" "chocolate4"
## v13 v14 v15
## "purple4" "green4" "#FFB347"
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_italy.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
#countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
#countries_with_data2 <- unique(df1$Country)
selected_countries3 <- world
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
# filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_fareast.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_iberia.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k15_pie_balkans_labels.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
Import the Q matrix (K14 for fastStructure) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k14run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run004/simple.14.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k14run4 <- k14run4 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k14run4)
## # A tibble: 6 × 14
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 6.09e-1 0.391 2e-6 2e-6 2e-6
## 2 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 2 e-6 1.00 2e-6 2e-6 2e-6
## 3 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 3.98e-1 0.602 2e-6 2e-6 2e-6
## 4 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 4.37e-1 0.563 2e-6 2e-6 2e-6
## 5 0.000002 2e-6 2e-6 2 e-6 2e-6 2e-6 2e-6 6.67e-1 0.333 2e-6 2e-6 2e-6
## 6 0.000002 2e-6 2e-6 1.02e-1 2e-6 2e-6 2e-6 4.70e-1 0.428 2e-6 2e-6 2e-6
## # ℹ 2 more variables: X13 <dbl>, X14 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201 2e-06
## 2 1066 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977 2e-06
## 3 1067 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095 2e-06
## 4 1068 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699 2e-06
## 5 1069 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931 2e-06
## 6 1070 SOC 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289 2e-06
## X11 X12 X13 X14
## 1 2e-06 2e-06 2e-06 2e-06
## 2 2e-06 2e-06 2e-06 2e-06
## 3 2e-06 2e-06 2e-06 2e-06
## 4 2e-06 2e-06 2e-06 2e-06
## 5 2e-06 2e-06 2e-06 2e-06
## 6 2e-06 2e-06 2e-06 2e-06
Rename the columns
# Rename the columns starting from the third one
k14run4 <- k14run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k14run4)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201 2e-06
## 2 1066 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977 2e-06
## 3 1067 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095 2e-06
## 4 1068 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699 2e-06
## 5 1069 SOC 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931 2e-06
## 6 1070 SOC 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289 2e-06
## v11 v12 v13 v14
## 1 2e-06 2e-06 2e-06 2e-06
## 2 2e-06 2e-06 2e-06 2e-06
## 3 2e-06 2e-06 2e-06 2e-06
## 4 2e-06 2e-06 2e-06 2e-06
## 5 2e-06 2e-06 2e-06 2e-06
## 6 2e-06 2e-06 2e-06 2e-06
Merge with pops
# Add an index column to Q_tibble
k14run4$index <- seq_len(nrow(k14run4))
# Perform the merge as before
df1 <-
merge(
k14run4,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8 v9
## 310 SOC 1065 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.608778 0.391201
## 311 SOC 1066 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.000002 0.999977
## 312 SOC 1067 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.397884 0.602095
## 313 SOC 1068 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.437280 0.562699
## 314 SOC 1069 2e-06 2e-06 2e-06 0.000002 2e-06 2e-06 2e-06 0.667048 0.332931
## 315 SOC 1070 2e-06 2e-06 2e-06 0.101578 2e-06 2e-06 2e-06 0.470114 0.428289
## v10 v11 v12 v13 v14 Latitude Longitude Pop_City Country
## 310 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 311 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 312 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 313 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 314 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 315 2e-06 2e-06 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## Region Subregion Year order
## 310 Eastern Europe East Europe 2021 38
## 311 Eastern Europe East Europe 2021 38
## 312 Eastern Europe East Europe 2021 38
## 313 Eastern Europe East Europe 2021 38
## 314 Eastern Europe East Europe 2021 38
## 315 Eastern Europe East Europe 2021 38
Q-values for k=14
make a palette with 14 colors
colors2 <-c(
"v1" = "#F49AC2",
"v2" = "#77DD77",
"v3" = "blue",
"v4" = "#FF8C1A",
"v5" = "#B20CC9",
"v6" = "green4",
"v7" = "#B22222",
"v8" = "purple4",
"v9" = "purple",
"v10" = "chocolate4",
"v11" = "#1E90FF",
"v12" = "#008080",
"v13" = "green",
"v14" = "yellow2"
)
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_italy.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
#countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
#countries_with_data2 <- unique(df1$Country)
selected_countries3 <- world
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
# filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_fareast.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_iberia.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k14_pie_balkans_labels.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
Import the Q matrix (K13 for fastStructure) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k13run8 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run008/simple.13.meanQ"
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k13run8 <- k13run8 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k13run8)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.546 0.443 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 1.17e-2 2 e-6 2e-6
## 2 0.000002 1.00 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 2 e-6 2 e-6 2e-6
## 3 0.000002 1.00 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 2 e-6 2 e-6 2e-6
## 4 0.000002 0.843 2e-6 2 e-6 2e-6 1.57e-1 2e-6 2e-6 2 e-6 2 e-6 2e-6
## 5 0.483 0.475 2e-6 2 e-6 2e-6 2 e-6 2e-6 2e-6 4.20e-2 2 e-6 2e-6
## 6 0.313 0.577 2e-6 2.64e-2 2e-6 2 e-6 2e-6 2e-6 2 e-6 8.40e-2 2e-6
## # ℹ 2 more variables: X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
## X10 X11 X12 X13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06
Rename the columns
# Rename the columns starting from the third one
k13run8 <- k13run8 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k13run8)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9
## 1 1065 SOC 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.011724
## 2 1066 SOC 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 3 1067 SOC 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.000002
## 4 1068 SOC 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06 0.000002
## 5 1069 SOC 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06 0.042005
## 6 1070 SOC 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06 0.000002
## v10 v11 v12 v13
## 1 0.000002 2e-06 2e-06 2e-06
## 2 0.000002 2e-06 2e-06 2e-06
## 3 0.000002 2e-06 2e-06 2e-06
## 4 0.000002 2e-06 2e-06 2e-06
## 5 0.000002 2e-06 2e-06 2e-06
## 6 0.084006 2e-06 2e-06 2e-06
Merge with pops
# Add an index column to Q_tibble
k13run8$index <- seq_len(nrow(k13run8))
# Perform the merge as before
df1 <-
merge(
k13run8,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8
## 310 SOC 1065 0.545699 0.442558 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 311 SOC 1066 0.000002 0.999977 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 312 SOC 1067 0.000002 0.999978 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 313 SOC 1068 0.000002 0.843129 2e-06 0.000002 2e-06 0.156850 2e-06 2e-06
## 314 SOC 1069 0.482681 0.475295 2e-06 0.000002 2e-06 0.000002 2e-06 2e-06
## 315 SOC 1070 0.312670 0.576942 2e-06 0.026365 2e-06 0.000002 2e-06 2e-06
## v9 v10 v11 v12 v13 Latitude Longitude Pop_City Country
## 310 0.011724 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 311 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 312 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 313 0.000002 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 314 0.042005 0.000002 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## 315 0.000002 0.084006 2e-06 2e-06 2e-06 43.60042 39.74533 Sochi Russia
## Region Subregion Year order
## 310 Eastern Europe East Europe 2021 38
## 311 Eastern Europe East Europe 2021 38
## 312 Eastern Europe East Europe 2021 38
## 313 Eastern Europe East Europe 2021 38
## 314 Eastern Europe East Europe 2021 38
## 315 Eastern Europe East Europe 2021 38
Q-values for k=13
make a palette with 13 colors
colors2 <-c(
"v1" = "purple4",
"v2" = "purple",
"v3" = "blue",
"v4" = "#FF8C1A",
"v5" = "chocolate4",
"v6" = "#B20CD9",
"v7" = "#B22222",
"v8" = "#1E90FF",
"v9" = "green",
"v10" = "yellow2",
"v11" = "#77DD77",
"v12" = "#008080",
"v13" = "#F49AC2"
)
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_italy.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
#countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
#countries_with_data2 <- unique(df1$Country)
selected_countries3 <- world
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
# filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_fareast.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_iberia.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k13_pie_balkans_labels.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
Import the Q matrix (K18 for fastStructure)
# Extract ancestry coefficients
k18run2 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run002/simple.18.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k18run2 <- k18run2 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k18run2)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1e-6 1e-6 1e-6 1e-6 2.77e-1 1e-6 1e-6 6.23e-1 1e-6 1e-6 1 e-6 1e-6
## 2 1e-6 1e-6 1e-6 1e-6 1.00e+0 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6
## 3 1e-6 1e-6 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6 1e-6 1 e-6 1e-6
## 4 1e-6 1e-6 1e-6 1e-6 5.60e-1 1e-6 1e-6 4.33e-1 1e-6 1e-6 6.86e-3 1e-6
## 5 1e-6 1e-6 1e-6 1e-6 2.69e-1 1e-6 1e-6 7.31e-1 1e-6 1e-6 1 e-6 1e-6
## 6 1e-6 1e-6 1e-6 1e-6 4.14e-1 1e-6 1e-6 3.85e-1 1e-6 1e-6 1 e-6 1e-6
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## # X18 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
## X11 X12 X13 X14 X15 X16 X17 X18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06
Rename the columns
# Rename the columns starting from the third one
k18run2 <- k18run2 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k18run2)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 2 1066 SOC 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 3 1067 SOC 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 4 1068 SOC 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 5 1069 SOC 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 6 1070 SOC 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
## v11 v12 v13 v14 v15 v16 v17 v18
## 1 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06
## 2 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 3 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06
## 4 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 5 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06
## 6 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06
Merge with pops
# Add an index column to Q_tibble
k18run2$index <- seq_len(nrow(k18run2))
# Perform the merge as before
df1 <-
merge(
k18run2,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 310 SOC 1065 1e-06 1e-06 1e-06 1e-06 0.277354 1e-06 1e-06 0.622699 1e-06 1e-06
## 311 SOC 1066 1e-06 1e-06 1e-06 1e-06 0.999977 1e-06 1e-06 0.000001 1e-06 1e-06
## 312 SOC 1067 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06 1e-06 0.000001 1e-06 1e-06
## 313 SOC 1068 1e-06 1e-06 1e-06 1e-06 0.559749 1e-06 1e-06 0.433370 1e-06 1e-06
## 314 SOC 1069 1e-06 1e-06 1e-06 1e-06 0.268843 1e-06 1e-06 0.731136 1e-06 1e-06
## 315 SOC 1070 1e-06 1e-06 1e-06 1e-06 0.413797 1e-06 1e-06 0.384657 1e-06 1e-06
## v11 v12 v13 v14 v15 v16 v17 v18 Latitude Longitude
## 310 0.000001 1e-06 1e-06 1e-06 1e-06 0.099926 0.000001 1e-06 43.60042 39.74533
## 311 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042 39.74533
## 312 0.000001 1e-06 1e-06 1e-06 1e-06 0.999977 0.000001 1e-06 43.60042 39.74533
## 313 0.006860 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042 39.74533
## 314 0.000001 1e-06 1e-06 1e-06 1e-06 0.000001 0.000001 1e-06 43.60042 39.74533
## 315 0.000001 1e-06 1e-06 1e-06 1e-06 0.141181 0.060346 1e-06 43.60042 39.74533
## Pop_City Country Region Subregion Year order
## 310 Sochi Russia Eastern Europe East Europe 2021 38
## 311 Sochi Russia Eastern Europe East Europe 2021 38
## 312 Sochi Russia Eastern Europe East Europe 2021 38
## 313 Sochi Russia Eastern Europe East Europe 2021 38
## 314 Sochi Russia Eastern Europe East Europe 2021 38
## 315 Sochi Russia Eastern Europe East Europe 2021 38
Q-values for k=18
make a palette with 18 colors
colors2 <-c(
"v1" = "#008080",
"v2" = "#1E90FF",
"v3" = "green4",
"v4" = "#FFB347",
"v5" = "#B20CD9",
"v6" = "orchid",
"v7" = "blue",
"v8" = "purple",
"v9" = "#75FAFF",
"v10" = "#B22222",
"v11" = "#F49AC2",
"v12" = "yellow2",
"v13" = "green",
"v14" = "#FF8C1A",
"v15" = "chocolate4",
"v16" = "purple4",
"v17" = "#77DD77",
"v18" = "#FFFF99"
)
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_italy.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
#countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
#countries_with_data2 <- unique(df1$Country)
selected_countries3 <- world
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
# filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_fareast.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_iberia.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/figures/fastStructure/MAF_1/fastStructure_MAF1_k18_pie_balkans_labels.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run4 <- read_delim(
here("output", "europe", "admixture", "r2_0.1.15.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k15run4)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.361 0.0551 0.00001 0.00001 0.00342 0.00001 0.00001 0.00001 0.00001
## 2 0.892 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 3 0.00556 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.575 0.0486 0.00730 0.00001 0.00001 0.00001 0.00001 0.00001 0.00577
## 5 0.390 0.0630 0.00001 0.0300 0.000727 0.00793 0.00452 0.00001 0.00001
## 6 0.429 0.0622 0.00001 0.00001 0.0177 0.00001 0.00001 0.0188 0.0319
## # ℹ 6 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7
## 1 1065 SOC 0.361376 0.055147 0.000010 0.000010 0.003424 0.000010 0.000010
## 2 1066 SOC 0.892415 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 3 1067 SOC 0.005563 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.574814 0.048574 0.007302 0.000010 0.000010 0.000010 0.000010
## 5 1069 SOC 0.390191 0.063016 0.000010 0.029971 0.000727 0.007935 0.004521
## 6 1070 SOC 0.429234 0.062239 0.000010 0.000010 0.017672 0.000010 0.000010
## X8 X9 X10 X11 X12 X13 X14 X15
## 1 0.000010 0.000010 0.356106 0.000010 0.223847 0.00001 0.000010 0.000010
## 2 0.000010 0.000010 0.000010 0.000010 0.107455 0.00001 0.000010 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.994307 0.00001 0.000010 0.000010
## 4 0.000010 0.005773 0.312298 0.000849 0.023455 0.00305 0.004865 0.018971
## 5 0.000010 0.000010 0.482465 0.000010 0.019218 0.00001 0.000010 0.001896
## 6 0.018843 0.031885 0.259303 0.000010 0.175995 0.00001 0.004243 0.000528
Rename the columns
# Rename the columns starting from the third one
k15run4 <- k15run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run4)
## ind pop v1 v2 v3 v4 v5 v6 v7
## 1 1065 SOC 0.361376 0.055147 0.000010 0.000010 0.003424 0.000010 0.000010
## 2 1066 SOC 0.892415 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 3 1067 SOC 0.005563 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.574814 0.048574 0.007302 0.000010 0.000010 0.000010 0.000010
## 5 1069 SOC 0.390191 0.063016 0.000010 0.029971 0.000727 0.007935 0.004521
## 6 1070 SOC 0.429234 0.062239 0.000010 0.000010 0.017672 0.000010 0.000010
## v8 v9 v10 v11 v12 v13 v14 v15
## 1 0.000010 0.000010 0.356106 0.000010 0.223847 0.00001 0.000010 0.000010
## 2 0.000010 0.000010 0.000010 0.000010 0.107455 0.00001 0.000010 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.994307 0.00001 0.000010 0.000010
## 4 0.000010 0.005773 0.312298 0.000849 0.023455 0.00305 0.004865 0.018971
## 5 0.000010 0.000010 0.482465 0.000010 0.019218 0.00001 0.000010 0.001896
## 6 0.018843 0.031885 0.259303 0.000010 0.175995 0.00001 0.004243 0.000528
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k15run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"blue",
"#77DD77",
"#F49AC2",
"green4",
"yellow2",
"#FFFF99",
"purple4",
"chocolate4",
"#FF8C1A",
"#008080",
"#B20CD9",
"green",
"#1E90FF",
"purple",
"#B22222"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n Admixture for k1:40 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k5run4 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/run4/r2_0.1.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run4)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0101 0.000045 0.0314 0.946 0.0127
## 2 0.000012 0.00001 0.00001 1.00 0.00001
## 3 0.00001 0.00001 0.00002 1.00 0.00001
## 4 0.00947 0.0335 0.0581 0.899 0.00001
## 5 0.0220 0.00637 0.0356 0.889 0.0471
## 6 0.0396 0.00998 0.0868 0.864 0.00001
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k5run4 <- k5run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k5run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c( "#1E90FF",
"red",
"purple3",
"#FFFF19",
"#FF8C1A")
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:40 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k6run3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/run3/r2_0.1.6.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k6run3)
## # A tibble: 6 × 6
## X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0245 0.00001 0.940 0.0120 0.0238 0.00001
## 2 0.00001 0.00001 1.00 0.00001 0.00001 0.00001
## 3 0.00001 0.00001 1.00 0.00001 0.00001 0.00001
## 4 0.0481 0.00001 0.893 0.00001 0.0466 0.0118
## 5 0.0243 0.000011 0.882 0.0473 0.0466 0.00001
## 6 0.0809 0.0226 0.860 0.00001 0.0365 0.00001
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k6run3 <- k6run3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k6run3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FFFF19",
"#77DD37",
"#FF8C1A",
"red",
"purple3",
"#1E90FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:6)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n Admixture for k1:40 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run4 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run4/r2_0.01.15.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k15run4)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00001 0.00001 0.00001 0.0802 0.00001 0.00001 0.511 0.00001 0.00001
## 2 0.00001 0.00001 0.00001 0.00001 0.000011 0.00001 0.00001 0.00001 0.00001
## 3 0.00001 0.00001 0.802 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.000302 0.00001 0.00001 0.0385 0.0181 0.00721 0.310 0.0385 0.0240
## 5 0.00001 0.0311 0.0327 0.0640 0.00001 0.00001 0.452 0.00855 0.00728
## 6 0.0141 0.00001 0.223 0.0812 0.00001 0.00663 0.232 0.00001 0.00001
## # ℹ 6 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7
## 1 1065 SOC 0.000010 0.000010 0.000010 0.080208 0.000010 0.000010 0.511214
## 2 1066 SOC 0.000010 0.000010 0.000010 0.000010 0.000011 0.000010 0.000010
## 3 1067 SOC 0.000010 0.000010 0.801950 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.000302 0.000010 0.000010 0.038536 0.018128 0.007211 0.310458
## 5 1069 SOC 0.000010 0.031117 0.032690 0.064019 0.000010 0.000010 0.452264
## 6 1070 SOC 0.014112 0.000010 0.223026 0.081151 0.000010 0.006632 0.231936
## X8 X9 X10 X11 X12 X13 X14 X15
## 1 0.000010 0.000010 0.000010 0.369351 1e-05 0.039116 0.00001 0.000010
## 2 0.000010 0.000010 0.000010 0.279744 1e-05 0.720126 0.00001 0.000010
## 3 0.000010 0.000010 0.000010 0.073619 1e-05 0.124311 0.00001 0.000010
## 4 0.038546 0.023974 0.000010 0.200266 1e-05 0.360237 0.00001 0.002292
## 5 0.008548 0.007282 0.046382 0.078845 1e-05 0.274773 0.00001 0.004030
## 6 0.000010 0.000010 0.031754 0.017665 1e-05 0.354584 0.03908 0.000010
Rename the columns
# Rename the columns starting from the third one
k15run4 <- k15run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run4)
## ind pop v1 v2 v3 v4 v5 v6 v7
## 1 1065 SOC 0.000010 0.000010 0.000010 0.080208 0.000010 0.000010 0.511214
## 2 1066 SOC 0.000010 0.000010 0.000010 0.000010 0.000011 0.000010 0.000010
## 3 1067 SOC 0.000010 0.000010 0.801950 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.000302 0.000010 0.000010 0.038536 0.018128 0.007211 0.310458
## 5 1069 SOC 0.000010 0.031117 0.032690 0.064019 0.000010 0.000010 0.452264
## 6 1070 SOC 0.014112 0.000010 0.223026 0.081151 0.000010 0.006632 0.231936
## v8 v9 v10 v11 v12 v13 v14 v15
## 1 0.000010 0.000010 0.000010 0.369351 1e-05 0.039116 0.00001 0.000010
## 2 0.000010 0.000010 0.000010 0.279744 1e-05 0.720126 0.00001 0.000010
## 3 0.000010 0.000010 0.000010 0.073619 1e-05 0.124311 0.00001 0.000010
## 4 0.038546 0.023974 0.000010 0.200266 1e-05 0.360237 0.00001 0.002292
## 5 0.008548 0.007282 0.046382 0.078845 1e-05 0.274773 0.00001 0.004030
## 6 0.000010 0.000010 0.031754 0.017665 1e-05 0.354584 0.03908 0.000010
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k15run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FF8C1A",
"purple4",
"#1E90FF",
"orchid",
"blue",
"purple",
"chocolate4",
"#77DD77",
"#B22222",
"#F49AC2",
"green",
"yellow2",
"#008080",
"#B20CD9",
"green4"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n Admixture for k1:40 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k5run4 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run4/r2_0.01.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run4)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00001 0.0449 0.0554 0.872 0.0274
## 2 0.000829 0.0217 0.00001 0.977 0.00001
## 3 0.00001 0.00001 0.00001 1.00 0.00001
## 4 0.0159 0.0888 0.0678 0.827 0.00001
## 5 0.0171 0.0779 0.0479 0.810 0.0469
## 6 0.0309 0.0210 0.127 0.821 0.00001
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k5run4 <- k5run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k5run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FFFF19",
"red",
"purple3",
"#1E90FF",
"#FF8C1A")
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:40 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k6run1 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run1/r2_0.01.6.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k6run1)
## # A tibble: 6 × 6
## X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0190 0.867 0.00001 0.0224 0.0407 0.0511
## 2 0.00001 0.973 0.000012 0.00001 0.0274 0.00001
## 3 0.00001 1.00 0.00001 0.00001 0.00001 0.00001
## 4 0.0225 0.820 0.00807 0.00001 0.0990 0.0508
## 5 0.00997 0.802 0.00693 0.0542 0.104 0.0231
## 6 0.00001 0.811 0.0240 0.00001 0.0530 0.112
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k6run1 <- k6run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k6run1 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#1E90FF",
"purple3",
"#FFFF19",
"#77DD37",
"#FF8C1A",
"red"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:6)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n Admixture for k1:40 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k13run4 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run4/r2_0.01.13.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k13run4)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00001 0.394 9.30e-3 0.0164 0.0179 0.273 0.00001 1.5 e-5 0.222 0.00001
## 2 0.00001 0.0192 1 e-5 0.00001 0.00001 0.827 0.00001 1 e-5 0.118 0.00001
## 3 0.00001 0.00001 1 e-5 0.00001 0.00001 0.0938 0.00001 1 e-5 0.906 0.00001
## 4 0.000074 0.329 6.18e-3 0.00233 0.0441 0.497 0.0180 1 e-5 0.0280 0.00698
## 5 0.00001 0.465 7.41e-4 0.00001 0.0112 0.345 0.00001 3.21e-2 0.0283 0.0431
## 6 0.0164 0.242 1 e-5 0.0391 0.00001 0.381 0.00716 1 e-5 0.199 0.0238
## # ℹ 3 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7
## 1 1065 SOC 0.000010 0.394353 0.009296 0.016374 0.017866 0.273391 0.000010
## 2 1066 SOC 0.000010 0.019216 0.000010 0.000010 0.000010 0.826863 0.000010
## 3 1067 SOC 0.000010 0.000010 0.000010 0.000010 0.000010 0.093768 0.000010
## 4 1068 SOC 0.000074 0.329223 0.006175 0.002333 0.044095 0.496514 0.017950
## 5 1069 SOC 0.000010 0.464799 0.000741 0.000010 0.011165 0.345466 0.000010
## 6 1070 SOC 0.016362 0.242028 0.000010 0.039087 0.000010 0.381455 0.007155
## X8 X9 X10 X11 X12 X13
## 1 0.000015 0.221642 0.000010 0.000010 0.067012 0.000010
## 2 0.000010 0.117549 0.000010 0.036281 0.000010 0.000010
## 3 0.000010 0.906122 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.027967 0.006979 0.025743 0.038663 0.004274
## 5 0.032062 0.028323 0.043070 0.002007 0.066806 0.005531
## 6 0.000010 0.198886 0.023795 0.000010 0.091182 0.000010
Rename the columns
# Rename the columns starting from the third one
k13run4 <- k13run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k13run4)
## ind pop v1 v2 v3 v4 v5 v6 v7
## 1 1065 SOC 0.000010 0.394353 0.009296 0.016374 0.017866 0.273391 0.000010
## 2 1066 SOC 0.000010 0.019216 0.000010 0.000010 0.000010 0.826863 0.000010
## 3 1067 SOC 0.000010 0.000010 0.000010 0.000010 0.000010 0.093768 0.000010
## 4 1068 SOC 0.000074 0.329223 0.006175 0.002333 0.044095 0.496514 0.017950
## 5 1069 SOC 0.000010 0.464799 0.000741 0.000010 0.011165 0.345466 0.000010
## 6 1070 SOC 0.016362 0.242028 0.000010 0.039087 0.000010 0.381455 0.007155
## v8 v9 v10 v11 v12 v13
## 1 0.000015 0.221642 0.000010 0.000010 0.067012 0.000010
## 2 0.000010 0.117549 0.000010 0.036281 0.000010 0.000010
## 3 0.000010 0.906122 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.027967 0.006979 0.025743 0.038663 0.004274
## 5 0.032062 0.028323 0.043070 0.002007 0.066806 0.005531
## 6 0.000010 0.198886 0.023795 0.000010 0.091182 0.000010
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k13run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"blue",
"green",
"#1E90FF",
"#B20CD9",
"#FF8C1A",
"#B22222",
"#F49AC2",
"#77DD77",
"green4",
"#008080",
"purple4",
"purple",
"yellow2"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:13)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n Admixture for 17,028 SNPs (Set 1).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run4 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run4/r2_0.01_b.15.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k15run4)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00001 0.00001 0.00491 0.000013 1e-5 0.00001 0.524 0.00001 0.00001 0.00001
## 2 0.00001 0.00001 0.00001 0.00001 1e-5 0.00001 0.00001 0.00001 0.00001 0.00001
## 3 0.00001 0.00001 0.802 0.00001 1e-5 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.00187 0.00001 0.00001 0.00001 1e-5 0.0177 0.332 0.0316 0.0252 0.00001
## 5 0.00001 0.0225 0.0319 0.00001 1e-5 0.00001 0.455 0.00589 0.00803 0.0362
## 6 0.0125 0.00001 0.227 0.0354 1e-5 0.0142 0.236 0.00001 0.00001 0.00559
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8
## 1 1065 SOC 0.000010 0.000010 0.004907 0.000013 1e-05 0.000010 0.524182 0.000010
## 2 1066 SOC 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 3 1067 SOC 0.000010 0.000010 0.801779 0.000010 1e-05 0.000010 0.000010 0.000010
## 4 1068 SOC 0.001872 0.000010 0.000010 0.000010 1e-05 0.017655 0.332402 0.031559
## 5 1069 SOC 0.000010 0.022489 0.031860 0.000010 1e-05 0.000010 0.455069 0.005887
## 6 1070 SOC 0.012493 0.000010 0.226502 0.035354 1e-05 0.014213 0.236432 0.000010
## X9 X10 X11 X12 X13 X14 X15
## 1 0.000010 0.000010 0.348159 1e-05 0.059473 0.00001 0.063177
## 2 0.000010 0.000010 0.268918 1e-05 0.730952 0.00001 0.000010
## 3 0.000010 0.000010 0.083907 1e-05 0.114194 0.00001 0.000010
## 4 0.025172 0.000010 0.190328 1e-05 0.371940 0.00001 0.029002
## 5 0.008030 0.036174 0.074614 1e-05 0.300856 0.00813 0.056840
## 6 0.000010 0.005589 0.006392 1e-05 0.380527 0.00001 0.082439
Rename the columns
# Rename the columns starting from the third one
k15run4 <- k15run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k15run4)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8
## 1 1065 SOC 0.000010 0.000010 0.004907 0.000013 1e-05 0.000010 0.524182 0.000010
## 2 1066 SOC 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 3 1067 SOC 0.000010 0.000010 0.801779 0.000010 1e-05 0.000010 0.000010 0.000010
## 4 1068 SOC 0.001872 0.000010 0.000010 0.000010 1e-05 0.017655 0.332402 0.031559
## 5 1069 SOC 0.000010 0.022489 0.031860 0.000010 1e-05 0.000010 0.455069 0.005887
## 6 1070 SOC 0.012493 0.000010 0.226502 0.035354 1e-05 0.014213 0.236432 0.000010
## v9 v10 v11 v12 v13 v14 v15
## 1 0.000010 0.000010 0.348159 1e-05 0.059473 0.00001 0.063177
## 2 0.000010 0.000010 0.268918 1e-05 0.730952 0.00001 0.000010
## 3 0.000010 0.000010 0.083907 1e-05 0.114194 0.00001 0.000010
## 4 0.025172 0.000010 0.190328 1e-05 0.371940 0.00001 0.029002
## 5 0.008030 0.036174 0.074614 1e-05 0.300856 0.00813 0.056840
## 6 0.000010 0.005589 0.006392 1e-05 0.380527 0.00001 0.082439
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k15run4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"purple4",
"#B20CD9",
"#F49AC2",
"blue",
"chocolate4",
"#FF8C1A",
"#B22222",
"green4",
"purple",
"orchid",
"yellow2",
"#008080",
"#77DD77",
"#1E90FF",
"green"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n Admixture for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k5run2 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run2/r2_0.01_b.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k5run2)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00454 0.0590 0.900 0.0363 0.00001
## 2 0.00001 0.00001 0.998 0.00166 0.00001
## 3 0.00001 0.00001 1.00 0.00001 0.00001
## 4 0.00362 0.0429 0.855 0.0918 0.00704
## 5 0.00001 0.0538 0.840 0.0960 0.0102
## 6 0.00001 0.0922 0.848 0.0405 0.0196
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k5run2 <- k5run2 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k5run2 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FF8C1A",
"#FFFF19",
"#77DD37",
"purple3",
"#1E90FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K5 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k6run3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.6.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k6run3)
## # A tibble: 6 × 6
## X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0558 0.00001 0.898 0.0130 0.0298 0.00373
## 2 0.00001 0.00001 0.997 0.00001 0.00311 0.00001
## 3 0.00001 0.00001 1.00 0.00001 0.00001 0.00001
## 4 0.0538 0.00598 0.852 0.00123 0.0848 0.00193
## 5 0.0142 0.00823 0.836 0.0474 0.0940 0.00001
## 6 0.115 0.0199 0.845 0.00001 0.0199 0.00001
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k6run3 <- k6run3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k6run3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#1E90FF",
"red",
"#FF8C1A",
"purple3",
"#FFFF19",
"#77DD37"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:6)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n Admixture for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K13 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k13run3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.13.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k13run3)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.215 0.00001 0.00001 0.00001 0.00001 0.0503 0.00001 0.00001 0.00776 0.0249
## 2 0.110 0.00001 0.00001 0.0133 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 3 0.914 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.0176 0.00001 0.0175 0.00001 0.00001 0.0294 0.00001 0.0278 0.0332 0.00463
## 5 0.0313 0.00932 0.0362 0.00001 0.0244 0.0580 0.00001 0.00001 0.00746 0.00001
## 6 0.194 0.00001 0.00001 0.00001 0.00001 0.0886 0.0143 0.0147 0.00001 0.0346
## # ℹ 3 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7
## 1 1065 SOC 0.214763 0.000010 0.000010 0.00001 0.000010 0.050272 0.000010
## 2 1066 SOC 0.109712 0.000010 0.000010 0.01334 0.000010 0.000010 0.000010
## 3 1067 SOC 0.914099 0.000010 0.000010 0.00001 0.000010 0.000010 0.000010
## 4 1068 SOC 0.017640 0.000010 0.017477 0.00001 0.000010 0.029389 0.000010
## 5 1069 SOC 0.031304 0.009315 0.036161 0.00001 0.024405 0.058041 0.000010
## 6 1070 SOC 0.193658 0.000010 0.000010 0.00001 0.000010 0.088579 0.014296
## X8 X9 X10 X11 X12 X13
## 1 0.000010 0.007757 0.024929 0.406487 0.295402 0.000330
## 2 0.000010 0.000010 0.000010 0.000010 0.876847 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.085791 0.000010
## 4 0.027831 0.033217 0.004631 0.341965 0.524722 0.003088
## 5 0.000010 0.007463 0.000010 0.459753 0.373508 0.000010
## 6 0.014673 0.000010 0.034588 0.249920 0.404226 0.000010
Rename the columns
# Rename the columns starting from the third one
k13run3 <- k13run3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k13run3)
## ind pop v1 v2 v3 v4 v5 v6 v7
## 1 1065 SOC 0.214763 0.000010 0.000010 0.00001 0.000010 0.050272 0.000010
## 2 1066 SOC 0.109712 0.000010 0.000010 0.01334 0.000010 0.000010 0.000010
## 3 1067 SOC 0.914099 0.000010 0.000010 0.00001 0.000010 0.000010 0.000010
## 4 1068 SOC 0.017640 0.000010 0.017477 0.00001 0.000010 0.029389 0.000010
## 5 1069 SOC 0.031304 0.009315 0.036161 0.00001 0.024405 0.058041 0.000010
## 6 1070 SOC 0.193658 0.000010 0.000010 0.00001 0.000010 0.088579 0.014296
## v8 v9 v10 v11 v12 v13
## 1 0.000010 0.007757 0.024929 0.406487 0.295402 0.000330
## 2 0.000010 0.000010 0.000010 0.000010 0.876847 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.085791 0.000010
## 4 0.027831 0.033217 0.004631 0.341965 0.524722 0.003088
## 5 0.000010 0.007463 0.000010 0.459753 0.373508 0.000010
## 6 0.014673 0.000010 0.034588 0.249920 0.404226 0.000010
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k13run3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#008080",
"purple4",
"purple",
"blue",
"yellow2",
"#FF8C1A",
"#1E90FF",
"green4",
"green",
"#B22222",
"#77DD77",
"#B20CD9",
"#F49AC2"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:13)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n Admixture for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K14 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k14run3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.14.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k14run3)
## # A tibble: 6 × 14
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00106 0.0508 1 e-5 0.00001 1 e-5 0.407 0.00001 0.215 0.0251 0.00765
## 2 0.00001 0.00001 1 e-5 0.00001 1 e-5 0.00001 0.00001 0.110 0.00001 0.00001
## 3 0.00001 0.00001 1 e-5 0.00001 1 e-5 0.00001 0.00001 0.914 0.00001 0.00001
## 4 0.00001 0.0293 1.76e-4 0.00001 5.52e-4 0.339 0.00001 0.0180 0.00522 0.0341
## 5 0.00001 0.0567 1 e-5 0.0240 8.84e-3 0.460 0.0353 0.0308 0.00001 0.00513
## 6 0.00001 0.0871 1.31e-2 0.00001 1 e-5 0.250 0.00480 0.193 0.0331 0.00001
## # ℹ 4 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7
## 1 1065 SOC 0.00106 0.050765 0.000010 0.000010 0.000010 0.406851 0.000010
## 2 1066 SOC 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 3 1067 SOC 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.00001 0.029296 0.000176 0.000010 0.000552 0.339113 0.000010
## 5 1069 SOC 0.00001 0.056695 0.000010 0.023968 0.008836 0.459668 0.035294
## 6 1070 SOC 0.00001 0.087083 0.013070 0.000010 0.000010 0.250427 0.004795
## X8 X9 X10 X11 X12 X13 X14
## 1 0.214582 0.025094 0.007646 0.000010 0.000013 0.293928 0.000010
## 2 0.109817 0.000010 0.000010 0.013705 0.000010 0.876368 0.000010
## 3 0.914203 0.000010 0.000010 0.000010 0.000010 0.085677 0.000010
## 4 0.017951 0.005221 0.034125 0.000010 0.026843 0.523743 0.022941
## 5 0.030823 0.000010 0.005126 0.000010 0.007622 0.371918 0.000010
## 6 0.193435 0.033125 0.000010 0.000010 0.000010 0.402723 0.015282
Rename the columns
# Rename the columns starting from the third one
k14run3 <- k14run3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k14run3)
## ind pop v1 v2 v3 v4 v5 v6 v7
## 1 1065 SOC 0.00106 0.050765 0.000010 0.000010 0.000010 0.406851 0.000010
## 2 1066 SOC 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 3 1067 SOC 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1068 SOC 0.00001 0.029296 0.000176 0.000010 0.000552 0.339113 0.000010
## 5 1069 SOC 0.00001 0.056695 0.000010 0.023968 0.008836 0.459668 0.035294
## 6 1070 SOC 0.00001 0.087083 0.013070 0.000010 0.000010 0.250427 0.004795
## v8 v9 v10 v11 v12 v13 v14
## 1 0.214582 0.025094 0.007646 0.000010 0.000013 0.293928 0.000010
## 2 0.109817 0.000010 0.000010 0.013705 0.000010 0.876368 0.000010
## 3 0.914203 0.000010 0.000010 0.000010 0.000010 0.085677 0.000010
## 4 0.017951 0.005221 0.034125 0.000010 0.026843 0.523743 0.022941
## 5 0.030823 0.000010 0.005126 0.000010 0.007622 0.371918 0.000010
## 6 0.193435 0.033125 0.000010 0.000010 0.000010 0.402723 0.015282
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k14run3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"green4",
"purple",
"chocolate4",
"yellow2",
"#FF8C1A",
"#B22222",
"#B20CD9",
"#1E90FF",
"blue",
"#77DD77",
"purple4",
"#008080",
"green",
"#F49AC2"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:14)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=14.\n Admixture for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K18 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k18run1 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run1/r2_0.01_b.18.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(k18run1)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00001 0.00001 0.00001 0.0148 0.00001 0.0208 0.338 0.00001 0.00001 0.00001
## 2 0.00001 0.00001 0.0675 0.00001 0.00001 0.691 0.241 0.00001 0.00001 0.00001
## 3 0.00001 0.00001 0.00001 0.792 0.00001 0.128 0.0798 0.00001 0.00001 0.00001
## 4 0.00001 0.00001 0.171 0.00001 0.00079 0.374 0.183 0.00001 0.00001 0.0191
## 5 0.00370 0.00001 0.0174 0.0301 0.00354 0.285 0.0677 0.0207 0.0496 0.00001
## 6 0.00001 0.0122 0.00001 0.226 0.00001 0.363 0.00001 0.00001 0.0140 0.0122
## # ℹ 8 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## # X16 <dbl>, X17 <dbl>, X18 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7
## 1 1065 SOC 0.000010 0.00001 0.000010 0.014837 0.000010 0.020841 0.337713
## 2 1066 SOC 0.000010 0.00001 0.067510 0.000010 0.000010 0.691434 0.240906
## 3 1067 SOC 0.000010 0.00001 0.000010 0.792464 0.000010 0.127570 0.079816
## 4 1068 SOC 0.000010 0.00001 0.171399 0.000010 0.000790 0.374352 0.183001
## 5 1069 SOC 0.003705 0.00001 0.017397 0.030122 0.003544 0.285460 0.067722
## 6 1070 SOC 0.000010 0.01216 0.000010 0.226478 0.000010 0.362962 0.000010
## X8 X9 X10 X11 X12 X13 X14 X15 X16
## 1 0.000010 0.000010 0.000010 0.560465 0.066013 1e-05 0.000010 1e-05 0.000010
## 2 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 4 0.000010 0.000010 0.019120 0.229189 0.006437 1e-05 0.015612 1e-05 0.000010
## 5 0.020749 0.049572 0.000010 0.456702 0.053867 1e-05 0.011089 1e-05 0.000010
## 6 0.000010 0.014015 0.012198 0.256707 0.080054 1e-05 0.000010 1e-05 0.035326
## X17 X18
## 1 1e-05 1e-05
## 2 1e-05 1e-05
## 3 1e-05 1e-05
## 4 1e-05 1e-05
## 5 1e-05 1e-05
## 6 1e-05 1e-05
Rename the columns
# Rename the columns starting from the third one
k18run1 <- k18run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k18run1)
## ind pop v1 v2 v3 v4 v5 v6 v7
## 1 1065 SOC 0.000010 0.00001 0.000010 0.014837 0.000010 0.020841 0.337713
## 2 1066 SOC 0.000010 0.00001 0.067510 0.000010 0.000010 0.691434 0.240906
## 3 1067 SOC 0.000010 0.00001 0.000010 0.792464 0.000010 0.127570 0.079816
## 4 1068 SOC 0.000010 0.00001 0.171399 0.000010 0.000790 0.374352 0.183001
## 5 1069 SOC 0.003705 0.00001 0.017397 0.030122 0.003544 0.285460 0.067722
## 6 1070 SOC 0.000010 0.01216 0.000010 0.226478 0.000010 0.362962 0.000010
## v8 v9 v10 v11 v12 v13 v14 v15 v16
## 1 0.000010 0.000010 0.000010 0.560465 0.066013 1e-05 0.000010 1e-05 0.000010
## 2 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 4 0.000010 0.000010 0.019120 0.229189 0.006437 1e-05 0.015612 1e-05 0.000010
## 5 0.020749 0.049572 0.000010 0.456702 0.053867 1e-05 0.011089 1e-05 0.000010
## 6 0.000010 0.014015 0.012198 0.256707 0.080054 1e-05 0.000010 1e-05 0.035326
## v17 v18
## 1 1e-05 1e-05
## 2 1e-05 1e-05
## 3 1e-05 1e-05
## 4 1e-05 1e-05
## 5 1e-05 1e-05
## 6 1e-05 1e-05
Import samples attributes
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 1
## 2 West Europe 2019 2
## 3 West Europe 2017 3
## 4 West Europe 2017 4
## 5 West Europe 2018 5
## 6 West Europe 2017 6
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- k18run1 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value")
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Region_Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Region_Country = unique(Q_ordered$Region_Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Region_Country, function(rc)
max(which(Q_ordered$Region_Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"green4",
"#75FAFF",
"goldenrod",
"yellow2",
"#B22222",
"orchid",
"#F49AC2",
"purple",
"purple4",
"#B20CD9",
"#1E90FF",
"green",
"#77DD77",
"#FF8C1A",
"chocolate4",
"#008080",
"#FFFF99",
"blue"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:18)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n Admixture for k1:25 with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k15run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run4/r2_0.01_b.15.Q"#copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k15run4 <- k15run4 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k15run4)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00001 0.00001 0.00491 0.000013 1e-5 0.00001 0.524 0.00001 0.00001 0.00001
## 2 0.00001 0.00001 0.00001 0.00001 1e-5 0.00001 0.00001 0.00001 0.00001 0.00001
## 3 0.00001 0.00001 0.802 0.00001 1e-5 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.00187 0.00001 0.00001 0.00001 1e-5 0.0177 0.332 0.0316 0.0252 0.00001
## 5 0.00001 0.0225 0.0319 0.00001 1e-5 0.00001 0.455 0.00589 0.00803 0.0362
## 6 0.0125 0.00001 0.227 0.0354 1e-5 0.0142 0.236 0.00001 0.00001 0.00559
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k15run4 <- k15run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
k15run4$index <- seq_len(nrow(k15run4))
# Perform the merge as before
df1 <-
merge(
k15run4,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7
## 310 SOC 1065 0.000010 0.000010 0.004907 0.000013 1e-05 0.000010 0.524182
## 311 SOC 1066 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010
## 312 SOC 1067 0.000010 0.000010 0.801779 0.000010 1e-05 0.000010 0.000010
## 313 SOC 1068 0.001872 0.000010 0.000010 0.000010 1e-05 0.017655 0.332402
## 314 SOC 1069 0.000010 0.022489 0.031860 0.000010 1e-05 0.000010 0.455069
## 315 SOC 1070 0.012493 0.000010 0.226502 0.035354 1e-05 0.014213 0.236432
## v8 v9 v10 v11 v12 v13 v14 v15
## 310 0.000010 0.000010 0.000010 0.348159 1e-05 0.059473 0.00001 0.063177
## 311 0.000010 0.000010 0.000010 0.268918 1e-05 0.730952 0.00001 0.000010
## 312 0.000010 0.000010 0.000010 0.083907 1e-05 0.114194 0.00001 0.000010
## 313 0.031559 0.025172 0.000010 0.190328 1e-05 0.371940 0.00001 0.029002
## 314 0.005887 0.008030 0.036174 0.074614 1e-05 0.300856 0.00813 0.056840
## 315 0.000010 0.000010 0.005589 0.006392 1e-05 0.380527 0.00001 0.082439
## Latitude Longitude Pop_City Country Region Subregion Year order
## 310 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 311 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 312 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 313 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 314 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 315 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
Q-values for k=15
make a palette with 15 colors
colors2 <-c(
"v1" = "#B22222",
"v2" = "#1E90FF",
"v3" = "#B20CD9",
"v4" = "#FFB347",
"v5" = "green4",
"v6" = "#008080",
"v7" = "purple",
"v8" = "#F49AC2",
"v9" = "chocolate4",
"v10" = "#FF8C1A",
"v11" = "orchid",
"v12" = "green",
"v13" = "purple4",
"v14" = "blue",
"v15" = "yellow2"
)
colors2
## v1 v2 v3 v4 v5 v6
## "#B22222" "#1E90FF" "#B20CD9" "#FFB347" "green4" "#008080"
## v7 v8 v9 v10 v11 v12
## "purple" "#F49AC2" "chocolate4" "#FF8C1A" "orchid" "green"
## v13 v14 v15
## "purple4" "blue" "yellow2"
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k15_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K15 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k5run2 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run2/r2_0.01_b.5.Q"#copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k5run2 <- k5run2 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k5run2)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00454 0.0590 0.900 0.0363 0.00001
## 2 0.00001 0.00001 0.998 0.00166 0.00001
## 3 0.00001 0.00001 1.00 0.00001 0.00001
## 4 0.00362 0.0429 0.855 0.0918 0.00704
## 5 0.00001 0.0538 0.840 0.0960 0.0102
## 6 0.00001 0.0922 0.848 0.0405 0.0196
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k5run2 <- k5run2 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
k5run2$index <- seq_len(nrow(k5run2))
# Perform the merge as before
df1 <-
merge(
k5run2,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 310 SOC 1065 0.004541 0.059013 0.900098 0.036339 0.000010 43.60042 39.74533
## 311 SOC 1066 0.000010 0.000010 0.998310 0.001660 0.000010 43.60042 39.74533
## 312 SOC 1067 0.000010 0.000010 0.999960 0.000010 0.000010 43.60042 39.74533
## 313 SOC 1068 0.003617 0.042869 0.854715 0.091759 0.007040 43.60042 39.74533
## 314 SOC 1069 0.000010 0.053826 0.839905 0.096034 0.010225 43.60042 39.74533
## 315 SOC 1070 0.000010 0.092154 0.847758 0.040523 0.019555 43.60042 39.74533
## Pop_City Country Region Subregion Year order
## 310 Sochi Russia Eastern Europe East Europe 2021 38
## 311 Sochi Russia Eastern Europe East Europe 2021 38
## 312 Sochi Russia Eastern Europe East Europe 2021 38
## 313 Sochi Russia Eastern Europe East Europe 2021 38
## 314 Sochi Russia Eastern Europe East Europe 2021 38
## 315 Sochi Russia Eastern Europe East Europe 2021 38
Q-values for k=5
make a palette with 5 colors
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K18 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k18run1 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run1/r2_0.01_b.18.Q"#copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k18run1 <- k18run1 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k18run1)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00001 0.00001 0.00001 0.0148 0.00001 0.0208 0.338 0.00001 0.00001 0.00001
## 2 0.00001 0.00001 0.0675 0.00001 0.00001 0.691 0.241 0.00001 0.00001 0.00001
## 3 0.00001 0.00001 0.00001 0.792 0.00001 0.128 0.0798 0.00001 0.00001 0.00001
## 4 0.00001 0.00001 0.171 0.00001 0.00079 0.374 0.183 0.00001 0.00001 0.0191
## 5 0.00370 0.00001 0.0174 0.0301 0.00354 0.285 0.0677 0.0207 0.0496 0.00001
## 6 0.00001 0.0122 0.00001 0.226 0.00001 0.363 0.00001 0.00001 0.0140 0.0122
## # ℹ 8 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## # X16 <dbl>, X17 <dbl>, X18 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k18run1 <- k18run1 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
k18run1$index <- seq_len(nrow(k18run1))
# Perform the merge as before
df1 <-
merge(
k18run1,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7
## 310 SOC 1065 0.000010 0.00001 0.000010 0.014837 0.000010 0.020841 0.337713
## 311 SOC 1066 0.000010 0.00001 0.067510 0.000010 0.000010 0.691434 0.240906
## 312 SOC 1067 0.000010 0.00001 0.000010 0.792464 0.000010 0.127570 0.079816
## 313 SOC 1068 0.000010 0.00001 0.171399 0.000010 0.000790 0.374352 0.183001
## 314 SOC 1069 0.003705 0.00001 0.017397 0.030122 0.003544 0.285460 0.067722
## 315 SOC 1070 0.000010 0.01216 0.000010 0.226478 0.000010 0.362962 0.000010
## v8 v9 v10 v11 v12 v13 v14 v15 v16
## 310 0.000010 0.000010 0.000010 0.560465 0.066013 1e-05 0.000010 1e-05 0.000010
## 311 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 312 0.000010 0.000010 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 313 0.000010 0.000010 0.019120 0.229189 0.006437 1e-05 0.015612 1e-05 0.000010
## 314 0.020749 0.049572 0.000010 0.456702 0.053867 1e-05 0.011089 1e-05 0.000010
## 315 0.000010 0.014015 0.012198 0.256707 0.080054 1e-05 0.000010 1e-05 0.035326
## v17 v18 Latitude Longitude Pop_City Country Region Subregion
## 310 1e-05 1e-05 43.60042 39.74533 Sochi Russia Eastern Europe East Europe
## 311 1e-05 1e-05 43.60042 39.74533 Sochi Russia Eastern Europe East Europe
## 312 1e-05 1e-05 43.60042 39.74533 Sochi Russia Eastern Europe East Europe
## 313 1e-05 1e-05 43.60042 39.74533 Sochi Russia Eastern Europe East Europe
## 314 1e-05 1e-05 43.60042 39.74533 Sochi Russia Eastern Europe East Europe
## 315 1e-05 1e-05 43.60042 39.74533 Sochi Russia Eastern Europe East Europe
## Year order
## 310 2021 38
## 311 2021 38
## 312 2021 38
## 313 2021 38
## 314 2021 38
## 315 2021 38
Q-values for k=18
make a palette with 18 colors
colors2 <-c(
"v1" = "blue",
"v2" = "#B22222",
"v3" = "goldenrod",
"v4" = "orchid",
"v5" = "#F49AC2",
"v6" = "purple4",
"v7" = "purple",
"v8" = "#1E90FF",
"v9" = "#FF8C1A",
"v10" = "#77DD77",
"v11" = "#B20CD9",
"v12" = "yellow2",
"v13" = "green4",
"v14" = "chocolate4",
"v15" = "green",
"v16" = "#008080",
"v17" = "#75FAFF",
"v18" = "#FFFF99"
)
colors2
## v1 v2 v3 v4 v5 v6
## "blue" "#B22222" "goldenrod" "orchid" "#F49AC2" "purple4"
## v7 v8 v9 v10 v11 v12
## "purple" "#1E90FF" "#FF8C1A" "#77DD77" "#B20CD9" "yellow2"
## v13 v14 v15 v16 v17 v18
## "green4" "chocolate4" "green" "#008080" "#75FAFF" "#FFFF99"
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k18_pie_all_countries.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K13 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k13run3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.13.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k13run3 <- k13run3 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k13run3)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.215 0.00001 0.00001 0.00001 0.00001 0.0503 0.00001 0.00001 0.00776 0.0249
## 2 0.110 0.00001 0.00001 0.0133 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 3 0.914 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.0176 0.00001 0.0175 0.00001 0.00001 0.0294 0.00001 0.0278 0.0332 0.00463
## 5 0.0313 0.00932 0.0362 0.00001 0.0244 0.0580 0.00001 0.00001 0.00746 0.00001
## 6 0.194 0.00001 0.00001 0.00001 0.00001 0.0886 0.0143 0.0147 0.00001 0.0346
## # ℹ 3 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k13run3 <- k13run3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
k13run3$index <- seq_len(nrow(k13run3))
# Perform the merge as before
df1 <-
merge(
k13run3,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7
## 310 SOC 1065 0.214763 0.000010 0.000010 0.00001 0.000010 0.050272 0.000010
## 311 SOC 1066 0.109712 0.000010 0.000010 0.01334 0.000010 0.000010 0.000010
## 312 SOC 1067 0.914099 0.000010 0.000010 0.00001 0.000010 0.000010 0.000010
## 313 SOC 1068 0.017640 0.000010 0.017477 0.00001 0.000010 0.029389 0.000010
## 314 SOC 1069 0.031304 0.009315 0.036161 0.00001 0.024405 0.058041 0.000010
## 315 SOC 1070 0.193658 0.000010 0.000010 0.00001 0.000010 0.088579 0.014296
## v8 v9 v10 v11 v12 v13 Latitude Longitude
## 310 0.000010 0.007757 0.024929 0.406487 0.295402 0.000330 43.60042 39.74533
## 311 0.000010 0.000010 0.000010 0.000010 0.876847 0.000010 43.60042 39.74533
## 312 0.000010 0.000010 0.000010 0.000010 0.085791 0.000010 43.60042 39.74533
## 313 0.027831 0.033217 0.004631 0.341965 0.524722 0.003088 43.60042 39.74533
## 314 0.000010 0.007463 0.000010 0.459753 0.373508 0.000010 43.60042 39.74533
## 315 0.014673 0.000010 0.034588 0.249920 0.404226 0.000010 43.60042 39.74533
## Pop_City Country Region Subregion Year order
## 310 Sochi Russia Eastern Europe East Europe 2021 38
## 311 Sochi Russia Eastern Europe East Europe 2021 38
## 312 Sochi Russia Eastern Europe East Europe 2021 38
## 313 Sochi Russia Eastern Europe East Europe 2021 38
## 314 Sochi Russia Eastern Europe East Europe 2021 38
## 315 Sochi Russia Eastern Europe East Europe 2021 38
Q-values for k=13
make a palette with 13 colors
colors2 <-c(
"v1" = "purple4",
"v2" = "blue",
"v3" = "#FF8C1A",
"v4" = "green4",
"v5" = "#1E90FF",
"v6" = "yellow2",
"v7" = "#B22222",
"v8" = "#77DD77",
"v9" = "#F49AC2",
"v10" = "#008080",
"v11" = "#B20CD9",
"v12" = "purple",
"v13" = "green"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k13_pie_all_countries.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13"), color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K14 for admixture) Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k14run3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run3/r2_0.01_b.14.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k14run3 <- k14run3 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k14run3)
## # A tibble: 6 × 14
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00106 0.0508 1 e-5 0.00001 1 e-5 0.407 0.00001 0.215 0.0251 0.00765
## 2 0.00001 0.00001 1 e-5 0.00001 1 e-5 0.00001 0.00001 0.110 0.00001 0.00001
## 3 0.00001 0.00001 1 e-5 0.00001 1 e-5 0.00001 0.00001 0.914 0.00001 0.00001
## 4 0.00001 0.0293 1.76e-4 0.00001 5.52e-4 0.339 0.00001 0.0180 0.00522 0.0341
## 5 0.00001 0.0567 1 e-5 0.0240 8.84e-3 0.460 0.0353 0.0308 0.00001 0.00513
## 6 0.00001 0.0871 1.31e-2 0.00001 1 e-5 0.250 0.00480 0.193 0.0331 0.00001
## # ℹ 4 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
k14run3 <- k14run3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
k14run3$index <- seq_len(nrow(k14run3))
# Perform the merge as before
df1 <-
merge(
k14run3,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7
## 310 SOC 1065 0.00106 0.050765 0.000010 0.000010 0.000010 0.406851 0.000010
## 311 SOC 1066 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 312 SOC 1067 0.00001 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 313 SOC 1068 0.00001 0.029296 0.000176 0.000010 0.000552 0.339113 0.000010
## 314 SOC 1069 0.00001 0.056695 0.000010 0.023968 0.008836 0.459668 0.035294
## 315 SOC 1070 0.00001 0.087083 0.013070 0.000010 0.000010 0.250427 0.004795
## v8 v9 v10 v11 v12 v13 v14 Latitude
## 310 0.214582 0.025094 0.007646 0.000010 0.000013 0.293928 0.000010 43.60042
## 311 0.109817 0.000010 0.000010 0.013705 0.000010 0.876368 0.000010 43.60042
## 312 0.914203 0.000010 0.000010 0.000010 0.000010 0.085677 0.000010 43.60042
## 313 0.017951 0.005221 0.034125 0.000010 0.026843 0.523743 0.022941 43.60042
## 314 0.030823 0.000010 0.005126 0.000010 0.007622 0.371918 0.000010 43.60042
## 315 0.193435 0.033125 0.000010 0.000010 0.000010 0.402723 0.015282 43.60042
## Longitude Pop_City Country Region Subregion Year order
## 310 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 311 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 312 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 313 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 314 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
## 315 39.74533 Sochi Russia Eastern Europe East Europe 2021 38
Q-values for k=14
make a palette with 14 colors
colors2 <-c(
"v1" = "green",
"v2" = "yellow2",
"v3" = "#B22222",
"v4" = "#1E90FF",
"v5" = "blue",
"v6" = "#B20CD9",
"v7" = "#FF8C1A",
"v8" = "purple4",
"v9" = "#008080",
"v10" = "#F49AC2",
"v11" = "green4",
"v12" = "chocolate4",
"v13" = "purple",
"v14" = "#77DD77"
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/admixture/MAF_1/admixture_MAF1_r01_k14_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()