library(tidyverse) # many helpful things
library(here) # to load data easily
library(dplyr) # to manipulate data
library(ggplot2) # plots
library(extrafont) # probably won't work on openOnDemand on the clusters
library(hrbrthemes) # not really needed, personal preference for plots
library(officer) # export office format
library(here)
Using ggplot2 for individual admixtures #### 1.1.1 Extract ancestry coefficients for k=3
leak3 <- read_delim(
here("output/microsats/STRUCTURE/overlap_nopops_run_13_Qmatrix.csv"),
col_names = FALSE,
show_col_types = FALSE)
head(leak3)
## # A tibble: 6 × 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 0.985 0.008 0.007
## 2 0.844 0.017 0.138
## 3 0.02 0.014 0.965
## 4 0.568 0.163 0.269
## 5 0.961 0.01 0.03
## 6 0.973 0.01 0.017
The population information
fam_file <- here("output/microsats/STRUCTURE/fam_file_data.txt")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID
## 1 FRS FRMH01
## 2 FRS FRMH02
## 3 FRS FRMH03
## 4 FRS FRMH04
## 5 FRS FRMH05
## 6 FRS FRMH06
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 FRMH01 FRS
## 2 FRMH02 FRS
## 3 FRMH03 FRS
## 4 FRMH04 FRS
## 5 FRMH05 FRS
## 6 FRMH06 FRS
Add it to the matrix
## ind pop X1 X2 X3
## 1 FRMH01 FRS 0.985 0.008 0.007
## 2 FRMH02 FRS 0.844 0.017 0.138
## 3 FRMH03 FRS 0.020 0.014 0.965
## 4 FRMH04 FRS 0.568 0.163 0.269
## 5 FRMH05 FRS 0.961 0.010 0.030
## 6 FRMH06 FRS 0.973 0.010 0.017
Rename the columns
# Rename the columns starting from the third one
leak3 <- leak3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak3)
## ind pop v1 v2 v3
## 1 FRMH01 FRS 0.985 0.008 0.007
## 2 FRMH02 FRS 0.844 0.017 0.138
## 3 FRMH03 FRS 0.020 0.014 0.965
## 4 FRMH04 FRS 0.568 0.163 0.269
## 5 FRMH05 FRS 0.961 0.010 0.030
## 6 FRMH06 FRS 0.973 0.010 0.017
Import Sample Locations
sampling_loc <- readRDS(here("output/microsats/STRUCTURE/sampling_loc_overlap.rds"))
head(sampling_loc)
## # A tibble: 6 × 10
## Pop_City Country Latitude Longitude Continent Abbreviation Year Region Marker
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 Saint-M… France 45.2 5.77 Europe FRS 2019 Weste… SNPs
## 2 Strasbo… France 48.6 7.75 Europe STS 2019 Weste… SNPs
## 3 Penafiel Portug… 41.2 -8.33 Europe POP 2017 South… SNPs
## 4 Badajoz Spain 38.9 -6.97 Europe SPB 2018 South… SNPs
## 5 San Roq… Spain 36.2 -5.37 Europe SPS 2017 South… SNPs
## 6 Catarro… Spain 39.4 -0.396 Europe SPC 2017 South… SNPs
## # ℹ 1 more variable: order <dbl>
source(
here("my_theme3.R")
)
# Melt the data frame for plotting
Q_melted <- leak3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#77DD77",
"yellow2",
"#FF8C1A"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:3)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=3.\n STRUCTURE inference with 11 microsats for 24 locations.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Using ggplot2 for individual admixtures #### 1.2.1 Extract ancestry coefficients for k=10
leak10 <- read_delim(
here("output/microsats/STRUCTURE/overlap_nopops_run_82_Qmatrix.csv"),
col_names = FALSE,
show_col_types = FALSE)
head(leak10)
## # A tibble: 6 × 10
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.932 0.011 0.004 0.005 0.005 0.016 0.012 0.004 0.005 0.005
## 2 0.199 0.012 0.004 0.007 0.04 0.618 0.049 0.006 0.007 0.058
## 3 0.046 0.011 0.012 0.031 0.071 0.046 0.017 0.017 0.009 0.741
## 4 0.034 0.007 0.007 0.007 0.897 0.013 0.005 0.011 0.014 0.006
## 5 0.538 0.031 0.008 0.011 0.043 0.004 0.013 0.005 0.003 0.344
## 6 0.894 0.008 0.009 0.026 0.012 0.021 0.005 0.009 0.01 0.005
The population information
fam_file <- here("output/microsats/STRUCTURE/fam_file_data.txt")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID
## 1 FRS FRMH01
## 2 FRS FRMH02
## 3 FRS FRMH03
## 4 FRS FRMH04
## 5 FRS FRMH05
## 6 FRS FRMH06
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 FRMH01 FRS
## 2 FRMH02 FRS
## 3 FRMH03 FRS
## 4 FRMH04 FRS
## 5 FRMH05 FRS
## 6 FRMH06 FRS
Add it to the matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 FRMH01 FRS 0.932 0.011 0.004 0.005 0.005 0.016 0.012 0.004 0.005 0.005
## 2 FRMH02 FRS 0.199 0.012 0.004 0.007 0.040 0.618 0.049 0.006 0.007 0.058
## 3 FRMH03 FRS 0.046 0.011 0.012 0.031 0.071 0.046 0.017 0.017 0.009 0.741
## 4 FRMH04 FRS 0.034 0.007 0.007 0.007 0.897 0.013 0.005 0.011 0.014 0.006
## 5 FRMH05 FRS 0.538 0.031 0.008 0.011 0.043 0.004 0.013 0.005 0.003 0.344
## 6 FRMH06 FRS 0.894 0.008 0.009 0.026 0.012 0.021 0.005 0.009 0.010 0.005
Rename the columns
# Rename the columns starting from the third one
leak10 <- leak10 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak10)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 FRMH01 FRS 0.932 0.011 0.004 0.005 0.005 0.016 0.012 0.004 0.005 0.005
## 2 FRMH02 FRS 0.199 0.012 0.004 0.007 0.040 0.618 0.049 0.006 0.007 0.058
## 3 FRMH03 FRS 0.046 0.011 0.012 0.031 0.071 0.046 0.017 0.017 0.009 0.741
## 4 FRMH04 FRS 0.034 0.007 0.007 0.007 0.897 0.013 0.005 0.011 0.014 0.006
## 5 FRMH05 FRS 0.538 0.031 0.008 0.011 0.043 0.004 0.013 0.005 0.003 0.344
## 6 FRMH06 FRS 0.894 0.008 0.009 0.026 0.012 0.021 0.005 0.009 0.010 0.005
Import Sample Locations
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/microsats/STRUCTURE/sampling_loc_euro_microsats.rds"))
head(sampling_loc)
## Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1 Limbe Cameroon 4.023252 9.194771 Africa LIB 2018
## 2 Morondava Madagascar -20.284200 44.279400 Africa MAD 2016
## 3 Trois-Bassins Reunion -21.109008 55.319206 Africa TRO 2017
## 4 Dauguet Mauritius -20.185300 57.521540 Africa DAU 2022
## 5 Franceville Gabon -1.592070 13.532420 Africa GAB 2015
## 6 Antananarivo Madagascar -18.879200 47.507900 Africa ANT 2022
## Region Subregion order order2 orderold order_microsat microsats
## 1 West Africa Africa 79 73 71 NA
## 2 East Africa East Africa 80 78 72 NA
## 3 Indian Ocean Indian Ocean 81 81 73 NA
## 4 Indian Ocean Indian Ocean 82 80 74 NA
## 5 Central Africa Africa NA 72 NA NA
## 6 East Africa East Africa NA 76 NA NA
## microsat_code alt_code X Country.1
## 1 NA Cameroon
## 2 NA Madagascar
## 3 NA Reunion
## 4 NA Mauritius
## 5 NA Gabon
## 6 NA Madagascar
source(
here("my_theme3.R")
)
# Melt the data frame for plotting
Q_melted <- leak10 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order_microsat, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"darkgreen",
"#1E90FF",
"cyan",
"yellow2",
"#F49AC2",
"#77DD77",
"magenta",
"red3",
"#FFB347",
"#FF8C1A")
# Generate all potential variable names
all_variables <- paste0("v", 1:10)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=10.\n STRUCTURE inference with 11 microsats for 24 locations.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
#data<-read.csv("sampling_loc_euro_only_microsats.csv", stringsAsFactors = TRUE)
#write_rds(data, "sampling_loc_euro_only_microsats.rds")
sampling_loc <- readRDS(here("output/microsats/STRUCTURE/sampling_loc_overlap.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Year, order)
head(pops)
## # A tibble: 6 × 8
## Abbreviation Latitude Longitude Pop_City Country Region Year order
## <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 FRS 45.2 5.77 Saint-Martin-d'Her… France Weste… 2019 9
## 2 STS 48.6 7.75 Strasbourg France Weste… 2019 10
## 3 POP 41.2 -8.33 Penafiel Portug… South… 2017 11
## 4 SPB 38.9 -6.97 Badajoz Spain South… 2018 13
## 5 SPS 36.2 -5.37 San Roque Spain South… 2017 14
## 6 SPC 39.4 -0.396 Catarroja Spain South… 2017 15
Merge with pops
# Add an index column to Q_tibble
leak3$index <- seq_len(nrow(leak3))
# Perform the merge as before
df1 <-
merge(
leak3,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 Latitude Longitude Pop_City
## 117 FRS FRMH01 0.985 0.008 0.007 45.16531 5.771806 Saint-Martin-d'Heres
## 118 FRS FRMH02 0.844 0.017 0.138 45.16531 5.771806 Saint-Martin-d'Heres
## 119 FRS FRMH03 0.020 0.014 0.965 45.16531 5.771806 Saint-Martin-d'Heres
## 120 FRS FRMH04 0.568 0.163 0.269 45.16531 5.771806 Saint-Martin-d'Heres
## 121 FRS FRMH05 0.961 0.010 0.030 45.16531 5.771806 Saint-Martin-d'Heres
## 122 FRS FRMH06 0.973 0.010 0.017 45.16531 5.771806 Saint-Martin-d'Heres
## Country Region Year order
## 117 France Western Europe 2019 9
## 118 France Western Europe 2019 9
## 119 France Western Europe 2019 9
## 120 France Western Europe 2019 9
## 121 France Western Europe 2019 9
## 122 France Western Europe 2019 9
## Linking to GEOS 3.9.1, GDAL 3.2.1, PROJ 7.2.1; sf_use_s2() is TRUE
##
## Attaching package: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("my_theme2.R")
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggsave(
here("STRUCTURE/overlap/results/Structure_k3_microsats_pie_overlap_allcountries.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("my_theme2.R")
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3"),
color = NA) +
scale_fill_manual(values = color_palette) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Using ggplot2 for individual admixtures
leak3 <- read_delim(
here("output/microsats/STRUCTURE/overlap_microsats_usepoploc_run_13_Qmatrix.csv"),
col_names = FALSE,
show_col_types = FALSE)
head(leak3)
## # A tibble: 6 × 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 0.994 0.001 0.005
## 2 0.896 0.003 0.101
## 3 0.235 0.003 0.763
## 4 0.598 0.031 0.37
## 5 0.965 0.001 0.034
## 6 0.985 0.002 0.013
The population information
fam_file <- here("output/microsats/STRUCTURE/overlap_loc_prior/fam_file_data.txt")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID
## 1 FRS FRMH01
## 2 FRS FRMH02
## 3 FRS FRMH03
## 4 FRS FRMH04
## 5 FRS FRMH05
## 6 FRS FRMH06
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 FRMH01 FRS
## 2 FRMH02 FRS
## 3 FRMH03 FRS
## 4 FRMH04 FRS
## 5 FRMH05 FRS
## 6 FRMH06 FRS
Add it to the matrix
## ind pop X1 X2 X3
## 1 FRMH01 FRS 0.994 0.001 0.005
## 2 FRMH02 FRS 0.896 0.003 0.101
## 3 FRMH03 FRS 0.235 0.003 0.763
## 4 FRMH04 FRS 0.598 0.031 0.370
## 5 FRMH05 FRS 0.965 0.001 0.034
## 6 FRMH06 FRS 0.985 0.002 0.013
Rename the columns
# Rename the columns starting from the third one
leak3 <- leak3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak3)
## ind pop v1 v2 v3
## 1 FRMH01 FRS 0.994 0.001 0.005
## 2 FRMH02 FRS 0.896 0.003 0.101
## 3 FRMH03 FRS 0.235 0.003 0.763
## 4 FRMH04 FRS 0.598 0.031 0.370
## 5 FRMH05 FRS 0.965 0.001 0.034
## 6 FRMH06 FRS 0.985 0.002 0.013
Import Sample Locations
sampling_loc <- readRDS(here("output/microsats/STRUCTURE/sampling_loc_overlap.rds"))
head(sampling_loc)
## # A tibble: 6 × 10
## Pop_City Country Latitude Longitude Continent Abbreviation Year Region Marker
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 Saint-M… France 45.2 5.77 Europe FRS 2019 Weste… SNPs
## 2 Strasbo… France 48.6 7.75 Europe STS 2019 Weste… SNPs
## 3 Penafiel Portug… 41.2 -8.33 Europe POP 2017 South… SNPs
## 4 Badajoz Spain 38.9 -6.97 Europe SPB 2018 South… SNPs
## 5 San Roq… Spain 36.2 -5.37 Europe SPS 2017 South… SNPs
## 6 Catarro… Spain 39.4 -0.396 Europe SPC 2017 South… SNPs
## # ℹ 1 more variable: order <dbl>
source(
here("my_theme3.R")
)
# Melt the data frame for plotting
Q_melted <- leak3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"yellow2",
"#FF8C1A",
"#77DD77"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:3)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=3.\n STRUCTURE inference using population data with 11 microsats for 24 locations.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Import samples attributes
#data<-read.csv("sampling_loc_euro_only_microsats.csv", stringsAsFactors = TRUE)
#write_rds(data, "sampling_loc_euro_only_microsats.rds")
sampling_loc <- readRDS(here("output/microsats/STRUCTURE/sampling_loc_overlap.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Year, order)
head(pops)
## # A tibble: 6 × 8
## Abbreviation Latitude Longitude Pop_City Country Region Year order
## <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 FRS 45.2 5.77 Saint-Martin-d'Her… France Weste… 2019 9
## 2 STS 48.6 7.75 Strasbourg France Weste… 2019 10
## 3 POP 41.2 -8.33 Penafiel Portug… South… 2017 11
## 4 SPB 38.9 -6.97 Badajoz Spain South… 2018 13
## 5 SPS 36.2 -5.37 San Roque Spain South… 2017 14
## 6 SPC 39.4 -0.396 Catarroja Spain South… 2017 15
Merge with pops
# Add an index column to Q_tibble
leak3$index <- seq_len(nrow(leak3))
# Perform the merge as before
df1 <-
merge(
leak3,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 Latitude Longitude Pop_City
## 117 FRS FRMH01 0.994 0.001 0.005 45.16531 5.771806 Saint-Martin-d'Heres
## 118 FRS FRMH02 0.896 0.003 0.101 45.16531 5.771806 Saint-Martin-d'Heres
## 119 FRS FRMH03 0.235 0.003 0.763 45.16531 5.771806 Saint-Martin-d'Heres
## 120 FRS FRMH04 0.598 0.031 0.370 45.16531 5.771806 Saint-Martin-d'Heres
## 121 FRS FRMH05 0.965 0.001 0.034 45.16531 5.771806 Saint-Martin-d'Heres
## 122 FRS FRMH06 0.985 0.002 0.013 45.16531 5.771806 Saint-Martin-d'Heres
## Country Region Year order
## 117 France Western Europe 2019 9
## 118 France Western Europe 2019 9
## 119 France Western Europe 2019 9
## 120 France Western Europe 2019 9
## 121 France Western Europe 2019 9
## 122 France Western Europe 2019 9
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("my_theme2.R")
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()