R Markdown

Load libraries needed for fastStructure

# library(devtools)         # if you need to install some package from github or something else
library(tidyverse)          # many helpful things
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.5.1     ✔ purrr   1.0.2
## ✔ tibble  3.2.1     ✔ dplyr   1.1.4
## ✔ tidyr   1.3.1     ✔ stringr 1.5.1
## ✔ readr   2.1.5     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(here)               # to load data easily
## here() starts at /gpfs/gibbs/pi/caccone/mkc54/albo
#library(colorout)           # colors are cool
library(dplyr)              # to manipulate data
library(ggplot2)            # plots
# library(RColorBrewer)     # color pallets
# library(grid)             # set up plots
# library(scales)           # helps with plots axis scales
# library(showtext)         # helps annotating plots
# library(reticulate)       # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont)          # probably won't work on openOnDemand on the clusters
## Registering fonts with R
library(hrbrthemes)         # not really needed, personal preference for plots
#library(hrbrmisc)           # not really needed, personal preference for plots
# library(stringr)          # for strings operations
#library(ggstatsplot)        # statistics and plotting
library(flextable)          # create tables
## 
## Attaching package: 'flextable'
## The following object is masked from 'package:purrr':
## 
##     compose
library(officer)            # export office format
library(here)

Euro_global dataset structure analyses

1. SNP Set 2: MAF 10% Euro_global dataset structure analyses

1.1 Run Simple prior in fastStructure for file with LD pruning with r2=0.1 (Set 2)

1.1.1 Check populations in file

awk '{print $1}' euro_global/output/snps_sets/r2_0.1.fam | sort | uniq -c | awk '{print $2}' 
## ALD
## ALU
## ALV
## ARM
## BAR
## BEN
## BER
## BRE
## BUL
## CAM
## CES
## CHA
## CRO
## DES
## FRS
## GEL
## GES
## GRA
## GRC
## GRV
## HAI
## HAN
## HOC
## HUN
## IMP
## INJ
## INW
## ITB
## ITP
## ITR
## JAF
## KAC
## KAG
## KAN
## KAT
## KER
## KLP
## KRA
## KUN
## LAM
## MAL
## MAT
## OKI
## PAL
## POL
## POP
## QNC
## RAR
## REC
## ROM
## ROS
## SER
## SEV
## SIC
## SLO
## SOC
## SON
## SPB
## SPC
## SPM
## SPS
## SSK
## STS
## SUF
## SUU
## TAI
## TIK
## TIR
## TRE
## TUA
## TUH
## UTS
## YUN

Count the total number of population

awk '{print $1}' euro_global/output/snps_sets/r2_0.1.fam | sort | uniq -c | wc -l
#73
## 73

1.1.2 Create a directory for each run.

cd euro_global/output/fastStructure/r_1
for i in $(seq -w 1 100)
do
  mkdir run$i
done
#ls #should have 100 folders now

1.1.3 Create the dsq files

cd euro_global/output/fastStructure/r_1
module load dSQ/1.05 

for k in $(seq 1 40); do
    for run in $(seq -w 1 100); do
        echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1; source /vast/palmer/apps/software/miniconda/23.3.1/etc/profile.d/conda.sh; module load miniconda/4.12.0; conda activate fastStructure; export PYTHONPATH=/home/mkc54/fastStructure/; python -m structure -K $k --input=/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.1 --output=/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run$run/simple --full --cv=10 --tol=10e-6"
    done
done > dsq_euro_global_r_1.txt

head dsq_euro_global_r_1.txt #check file

dsq --job-file dsq_euro_global_r_1.txt --output /dev/null --mail-type ALL -t 24:00:00 --partition=ycga --cpus-per-task=10 --job-name fS.simple_euro_global_r_1 --batch-file simple_euro_global_r_1.sh

#Check the files to see if they have 4,000 jobs:
head simple_euro_global_r_1.sh

1.1.4 Run the jobs

Run a random job from our array to see if works

sbatch --array=$((1 + $RANDOM % 1000)) simple_euro_global_r_1.sh

# check status
dsqa -j 15462647 #insert job number

#if it looks like its running ok, you can cancel it 
scancel 15462647

Submit all jobs.

sbatch simple_euro_global_r_1.sh
#Submitted batch job # 

#Check it
dsqa -j 15464508 #add job number here

1.1.5 Run autopsy once its done

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1
module load dSQ/1.05

dsqa -j 15464508 -f dsq_euro_global_r_1.txt -s TIMEOUT > re-run_jobs1.txt 2> 15464508_report1.txt; head 15464508_report1.txt
head re-run_jobs1.txt  #all completed

1.1.6. Find the optimal number of K.

you will see it varies from run to run. You can estimate mean, mode and median values. Then you choose one that makes sense.

#choose k
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1
srun -p ycga --pty -N 1 -n 1 -c 4 bash #start interactive session

module load miniconda
conda activate fastStructure 

#First check files
ls run*/* | wc -l # should be 20000

# chose k for manuscript data
export PYTHONPATH=/home/mkc54/fastStructure
python -m chooseK #see how we can use chooseK
#python /vast/palmer/home.mccleary/mkc54/fastStructure/chooseK.py
   #      --input=<file>

for i in cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run*
do
    echo $i | sed 's@.*/@@'; python -m chooseK --input=$i/simple | grep 'likelihood\|structure'
done

Get outputs in a file

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1

cat <(echo 'run likelihood structure') \
<(for i in /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run*
do
    echo $i | sed 's@.*/@@' | sed 's/run/run /' ; python -m chooseK --input=$i/simple | grep 'likelihood' | awk '{print $6, $8}'; python -m chooseK --input=$i/simple | grep 'structure' |  awk '{print $6, $10}'
done | xargs -n6 | awk '{print $2, $4, $6}') > simple_euro_global_r0.1_40k.txt

head simple_euro_global_r0.1_40k.txt #check the file
#ok

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1
cp -r simple_euro_global_r0.1_40k.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global #copy file 

1.1.7 Make a plot with the Ks.

Let’s import the data in R.

# library(devtools)         # if you need to install some package from github or something else
library(tidyverse)          # many helpful things
library(here)               # to load data easily
#library(colorout)           # colors are cool
library(dplyr)              # to manipulate data
library(ggplot2)            # plots
# library(RColorBrewer)     # color pallets
# library(grid)             # set up plots
# library(scales)           # helps with plots axis scales
# library(showtext)         # helps annotating plots
# library(reticulate)       # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont)          # probably won't work on openOnDemand on the clusters
library(hrbrthemes)         # not really needed, personal preference for plots
#library(hrbrmisc)           # not really needed, personal preference for plots
# library(stringr)          # for strings operations
#library(ggstatsplot)        # statistics and plotting
library(flextable)          # create tables
library(officer)            # export office format
library(here)

Import fastStructure

# function to import our choosek.py data
import_fastStructure <- function(file) {
  # import as a tibble and set columns as integers
  dat <- read_delim(
    file,
    col_names      = TRUE,
    show_col_types = FALSE,
    col_types      = "iii"
  )

  # get columns we need and make it long for plotting
  dat <- dat |>
    gather(
      structure, likelihood, -run
    )

  # rename the columns by index
  dat <- dat |>
    rename(
      run   = 1,
      model = 2,
      k     = 3
    )
  return(dat)
}

Run import function for simple prior models

choose_k_simple <- 
  import_fastStructure(
  here("scripts", "RMarkdowns", "output", "euro_global", "simple_euro_global_r0.1_40k.txt"
  )
)

Function to plot fastStructure choosek.py results

# function to plot our choosek.py data
plot_fastStructure <- function(df) {
  df |>
    ggplot() +
    geom_line(
      aes(
        x              = run,
        y              = k,
        color          = model
      ),
      linewidth = 1
    ) +
    scale_colour_manual(
      "model",
      values = c(
        structure      = "#9d60ff",
        likelihood     = "#ffc08c"
      ),
      labels = c(
        "Maximizes \n Likelihood \n", "Explain \n Structure"
      )
    ) +
    labs(
      x                = "Run",
      y                = "K",
      title            = "fastStructure simple for Europe, Asia & the Americas (r2<0.1)",
      caption          = "algorithm runs for K ranging from 1 to 40"
    ) +
    theme(
      panel.grid.major = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      panel.grid.minor = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      legend.text = element_text(
        size = 12
      ),
      legend.title = element_text(
        size           = 14,
        face           = "bold"
      ),
      legend.position = "right"
    )
}

Use our function to plot k values

plot_fastStructure(
  choose_k_simple
) +
  labs(
    subtitle = "simple fastStructure for Europe, Asia & the Americas (r2<0.1)"
  )

# save plot
ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global", "fastStructure", "fastStructure_simple_k40_euro_global_r0.1.pdf"
  ),
  width  = 8,
  height = 6,
  units  = "in"
)

1.1.8 Find mean K maximizing likelihood

choose_k_likelihood <- subset(choose_k_simple, model != "structure")
mean(choose_k_likelihood$k) #22.47
## [1] 22.47
median(choose_k_likelihood$k) #22
## [1] 22
common <- table(choose_k_likelihood$k)
common #22
## 
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 
##  4  1 10  9  5  8 15 12  8  9  4  7  4  1  3

2. Plots for fastStructure simple for Set 2 (r2<0.1) of the euro_global dataset

distinct_palette_20 <-
  c(
    "#FF8C1A",
    "yellow2",
    "#77DD77",
    "#1E90FF",
    "#B22222",
    "#AE8333",
    "#B20CC9",
    "#F49AC2",
    "blue",
    "#008080", 
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "green",
    "purple",
    "orangered"
  )

Using ggplot2 for individual admixtures

2.1 k=20 plot

2.1.1 Extract ancestry coefficients for k=20

change to correct run & Q matrix

fask20 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run005/simple.20.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask20)
## # A tibble: 6 × 20
##       X1    X2    X3    X4    X5    X6    X7     X8     X9   X10   X11   X12
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 0          0 0         0     0 0.546     0 0      0.0315 0.413     0     0
## 2 0.0702     0 0.154     0     0 0.330     0 0.0485 0.0673 0.330     0     0
## 3 0          0 0         0     0 1.00      0 0      0      0         0     0
## 4 0          0 0         0     0 1.00      0 0      0      0         0     0
## 5 0          0 0         0     0 1.00      0 0      0      0         0     0
## 6 0          0 0         0     0 1.00      0 0      0      0         0     0
## # ℹ 8 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>, X19 <dbl>, X20 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.1.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask20 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask20)

head(fask20)
##    ind pop      X1 X2       X3 X4 X5       X6 X7      X8       X9      X10 X11
## 1 1001 OKI 0.00000  0 0.000000  0  0 0.545698  0 0.00000 0.031465 0.413306   0
## 2 1002 OKI 0.07021  0 0.154226  0  0 0.330192  0 0.04849 0.067308 0.329567   0
## 3 1003 OKI 0.00000  0 0.000000  0  0 0.999991  0 0.00000 0.000000 0.000000   0
## 4 1004 OKI 0.00000  0 0.000000  0  0 0.999991  0 0.00000 0.000000 0.000000   0
## 5 1005 OKI 0.00000  0 0.000000  0  0 0.999991  0 0.00000 0.000000 0.000000   0
## 6 1006 OKI 0.00000  0 0.000000  0  0 0.999991  0 0.00000 0.000000 0.000000   0
##   X12 X13 X14 X15 X16      X17 X18 X19 X20
## 1   0   0   0   0   0 0.009523   0   0   0
## 2   0   0   0   0   0 0.000000   0   0   0
## 3   0   0   0   0   0 0.000000   0   0   0
## 4   0   0   0   0   0 0.000000   0   0   0
## 5   0   0   0   0   0 0.000000   0   0   0
## 6   0   0   0   0   0 0.000000   0   0   0

Rename the columns

# Rename the columns starting from the third one
fask20 <- fask20 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask20)
##    ind pop      v1 v2       v3 v4 v5       v6 v7      v8       v9      v10 v11
## 1 1001 OKI 0.00000  0 0.000000  0  0 0.545698  0 0.00000 0.031465 0.413306   0
## 2 1002 OKI 0.07021  0 0.154226  0  0 0.330192  0 0.04849 0.067308 0.329567   0
## 3 1003 OKI 0.00000  0 0.000000  0  0 0.999991  0 0.00000 0.000000 0.000000   0
## 4 1004 OKI 0.00000  0 0.000000  0  0 0.999991  0 0.00000 0.000000 0.000000   0
## 5 1005 OKI 0.00000  0 0.000000  0  0 0.999991  0 0.00000 0.000000 0.000000   0
## 6 1006 OKI 0.00000  0 0.000000  0  0 0.999991  0 0.00000 0.000000 0.000000   0
##   v12 v13 v14 v15 v16      v17 v18 v19 v20
## 1   0   0   0   0   0 0.009523   0   0   0
## 2   0   0   0   0   0 0.000000   0   0   0
## 3   0   0   0   0   0 0.000000   0   0   0
## 4   0   0   0   0   0 0.000000   0   0   0
## 5   0   0   0   0   0 0.000000   0   0   0
## 6   0   0   0   0   0 0.000000   0   0   0

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
2.1.2 Using ggplot2 for individual admixtures K=20
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask20 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))


# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette_20 <-
  c(
    "#FF8C1A",
    "#B20CC9",
    "#77DD77",
    "green",
    "#B22222",
    "orangered",
    "yellow2",
    "#F49AC2",
    "blue",
    "#008080", 
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "#AE8333",
    "purple",
    "#1E90FF"
      )

# Generate all potential variable names
all_variables <- paste0("v", 1:20)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = distinct_palette_20[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=20.\n fastStructure inference for k20 with 56,384 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_20) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "fastStructure_k=20_euro_global_r2_1_run5.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.2 k=5 plot

Try plotting k=5 (See if the 5 original Asian clusters hold)

2.2.1 Extract ancestry coefficients for k=5

choose best run here

fask5 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run001/simple.5.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(fask5)
## # A tibble: 6 × 5
##         X1    X2     X3    X4    X5
##      <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1 0.000002 0.195 0.116  0.166 0.523
## 2 0.00172  0.220 0.122  0.150 0.506
## 3 0.000002 0.197 0.118  0.135 0.551
## 4 0.000002 0.201 0.0905 0.202 0.506
## 5 0.000002 0.196 0.104  0.186 0.514
## 6 0.00182  0.206 0.108  0.153 0.531

The fam file

fam_file <- here(
  "euro_global/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask5)

head(fask5)
##    ind pop       X1       X2       X3       X4       X5
## 1 1001 OKI 0.000002 0.195295 0.116209 0.165633 0.522861
## 2 1002 OKI 0.001716 0.219744 0.122377 0.149972 0.506191
## 3 1003 OKI 0.000002 0.196637 0.117934 0.134634 0.550793
## 4 1004 OKI 0.000002 0.201285 0.090451 0.202476 0.505786
## 5 1005 OKI 0.000002 0.195789 0.104347 0.185696 0.514167
## 6 1006 OKI 0.001818 0.206088 0.107509 0.153097 0.531488

Rename the columns

# Rename the columns starting from the third one
fask5 <- fask5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask5)
##    ind pop       v1       v2       v3       v4       v5
## 1 1001 OKI 0.000002 0.195295 0.116209 0.165633 0.522861
## 2 1002 OKI 0.001716 0.219744 0.122377 0.149972 0.506191
## 3 1003 OKI 0.000002 0.196637 0.117934 0.134634 0.550793
## 4 1004 OKI 0.000002 0.201285 0.090451 0.202476 0.505786
## 5 1005 OKI 0.000002 0.195789 0.104347 0.185696 0.514167
## 6 1006 OKI 0.001818 0.206088 0.107509 0.153097 0.531488

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
2.2.2 Using ggplot2 for individual admixtures K=5
source(
  here("scripts", "RMarkdowns", 
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 


color_palette_5 <-
  c(
    "#FF8C1A",
    "#1E90FF",
    "#FFFF19",
    "#77DD37",
    "purple3"
     )


# Generate all potential variable names
all_variables <- paste0("v", 1:5)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_5[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 10
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n fastStructure inference for k5 with 56,384 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_5) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "fastStructure_k=5_euro_global_r2_1_run1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.3 k=22 plot

Plots for fastStructure k=22 for r2<0.1 (Set 2)

color_palette_22 <-
  c(
    "#FF8C1A",
    "yellow2",
    "#77DD77",
    "#1E90FF",
    "#B22222",
    "chocolate4",
    "#B20CC9",
    "#F49AC2",
    "blue",
    "#008080", 
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "green",
    "purple",
    "orangered2",
    "goldenrod3",
    "coral")
2.3.1 Extract ancestry coefficients for k=22

change to correct run for matrix

fask22 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run004/simple.22.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask22)
## # A tibble: 6 × 22
##      X1    X2    X3     X4    X5    X6     X7    X8    X9   X10   X11   X12
##   <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     0     0     0 0.0332     0     0 0          0     0     0     0 0.627
## 2     0     0     0 0.0957     0     0 0.0719     0     0     0     0 0.449
## 3     0     0     0 0          0     0 0          0     0     0     0 1.00 
## 4     0     0     0 0          0     0 0          0     0     0     0 1.00 
## 5     0     0     0 0          0     0 0          0     0     0     0 1.00 
## 6     0     0     0 0          0     0 0          0     0     0     0 1.00 
## # ℹ 10 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>, X22 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.1.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask22 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask22)

head(fask22)
##    ind pop X1 X2 X3       X4 X5 X6       X7 X8 X9 X10 X11      X12 X13      X14
## 1 1001 OKI  0  0  0 0.033221  0  0 0.000000  0  0   0   0 0.627499   0 0.000000
## 2 1002 OKI  0  0  0 0.095747  0  0 0.071857  0  0   0   0 0.448538   0 0.109662
## 3 1003 OKI  0  0  0 0.000000  0  0 0.000000  0  0   0   0 0.999991   0 0.000000
## 4 1004 OKI  0  0  0 0.000000  0  0 0.000000  0  0   0   0 0.999991   0 0.000000
## 5 1005 OKI  0  0  0 0.000000  0  0 0.000000  0  0   0   0 0.999991   0 0.000000
## 6 1006 OKI  0  0  0 0.000000  0  0 0.000000  0  0   0   0 0.999991   0 0.000000
##   X15      X16 X17 X18      X19      X20 X21 X22
## 1   0 0.000000   0   0 0.213558 0.125714   0   0
## 2   0 0.015617   0   0 0.258573 0.000000   0   0
## 3   0 0.000000   0   0 0.000000 0.000000   0   0
## 4   0 0.000000   0   0 0.000000 0.000000   0   0
## 5   0 0.000000   0   0 0.000000 0.000000   0   0
## 6   0 0.000000   0   0 0.000000 0.000000   0   0

Rename the columns

# Rename the columns starting from the third one
fask22 <- fask22 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask22)
##    ind pop v1 v2 v3       v4 v5 v6       v7 v8 v9 v10 v11      v12 v13      v14
## 1 1001 OKI  0  0  0 0.033221  0  0 0.000000  0  0   0   0 0.627499   0 0.000000
## 2 1002 OKI  0  0  0 0.095747  0  0 0.071857  0  0   0   0 0.448538   0 0.109662
## 3 1003 OKI  0  0  0 0.000000  0  0 0.000000  0  0   0   0 0.999991   0 0.000000
## 4 1004 OKI  0  0  0 0.000000  0  0 0.000000  0  0   0   0 0.999991   0 0.000000
## 5 1005 OKI  0  0  0 0.000000  0  0 0.000000  0  0   0   0 0.999991   0 0.000000
## 6 1006 OKI  0  0  0 0.000000  0  0 0.000000  0  0   0   0 0.999991   0 0.000000
##   v15      v16 v17 v18      v19      v20 v21 v22
## 1   0 0.000000   0   0 0.213558 0.125714   0   0
## 2   0 0.015617   0   0 0.258573 0.000000   0   0
## 3   0 0.000000   0   0 0.000000 0.000000   0   0
## 4   0 0.000000   0   0 0.000000 0.000000   0   0
## 5   0 0.000000   0   0 0.000000 0.000000   0   0
## 6   0 0.000000   0   0 0.000000 0.000000   0   0

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask22 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette_22 <-
  c(
    "#FF8C1A",
    "yellow2",
    "#77DD77",
    "#1E90FF",
    "#B22222",
    "chocolate4",
    "#B20CC9",
    "#F49AC2",
    "blue",
    "#008080", 
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "green",
    "purple",
    "orangered2",
    "goldenrod3",
    "coral")

# Generate all potential variable names
all_variables <- paste0("v", 1:22)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_22[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=22.\n fastStructure inference for k22 with 56,384 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_22) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "faststructure_k=22_euro_global_r2_1_run4.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.4 k=6 plots

2.4.1 Extract ancestry coefficients for k6

choose best run here

fask6 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run001/simple.6.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(fask6)
## # A tibble: 6 × 6
##      X1       X2       X3    X4    X5       X6
##   <dbl>    <dbl>    <dbl> <dbl> <dbl>    <dbl>
## 1 0.197 0.0302   0.0855   0.539 0.147 0.000356
## 2 0.218 0.0359   0.0944   0.530 0.118 0.00317 
## 3 0.193 0.00744  0.0844   0.585 0.126 0.00444 
## 4 0.215 0.0974   0.000002 0.566 0.122 0.000002
## 5 0.212 0.0853   0.000002 0.583 0.120 0.000002
## 6 0.207 0.000002 0.0737   0.572 0.144 0.00366

The fam file

fam_file <- here(
  "euro_global/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask6 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask6)

head(fask6)
##    ind pop       X1       X2       X3       X4       X5       X6
## 1 1001 OKI 0.197202 0.030164 0.085491 0.539402 0.147385 0.000356
## 2 1002 OKI 0.218262 0.035912 0.094369 0.530318 0.117972 0.003167
## 3 1003 OKI 0.192715 0.007444 0.084413 0.584812 0.126175 0.004441
## 4 1004 OKI 0.214797 0.097426 0.000002 0.565501 0.122273 0.000002
## 5 1005 OKI 0.211791 0.085347 0.000002 0.583273 0.119587 0.000002
## 6 1006 OKI 0.207172 0.000002 0.073748 0.571612 0.143808 0.003659

Rename the columns

# Rename the columns starting from the third one
fask6 <- fask6 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask6)
##    ind pop       v1       v2       v3       v4       v5       v6
## 1 1001 OKI 0.197202 0.030164 0.085491 0.539402 0.147385 0.000356
## 2 1002 OKI 0.218262 0.035912 0.094369 0.530318 0.117972 0.003167
## 3 1003 OKI 0.192715 0.007444 0.084413 0.584812 0.126175 0.004441
## 4 1004 OKI 0.214797 0.097426 0.000002 0.565501 0.122273 0.000002
## 5 1005 OKI 0.211791 0.085347 0.000002 0.583273 0.119587 0.000002
## 6 1006 OKI 0.207172 0.000002 0.073748 0.571612 0.143808 0.003659

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
2.4.2 Using ggplot2 for individual admixtures K=6
source(
  here("scripts", "RMarkdowns", 
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask6 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 


color_palette_6 <-
    c(
    "#77DD37",  
    "#1E90FF",
    "red",    
    "#FF8C1A",
    "purple3",    
    "#FFFF19"
    )

# Generate all potential variable names
all_variables <- paste0("v", 1:6)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                        color = color_palette_6[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 10
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n fastStructure inference for k6 with 56,384 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_6) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "fastStructure_k=6_euro_global_r2_1_run1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.5 K=7

2.5.1 Extract ancestry coefficients for k7

choose best run here

fask7 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run001/simple.7.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(fask7)
## # A tibble: 6 × 7
##         X1       X2    X3       X4       X5    X6    X7
##      <dbl>    <dbl> <dbl>    <dbl>    <dbl> <dbl> <dbl>
## 1 0.000001 0.398    0.187 0.000001 0.000001 0.225 0.190
## 2 0.000001 0.374    0.149 0.105    0.000001 0.233 0.138
## 3 0.00241  0.000001 0.269 0.000001 0.000001 0.305 0.424
## 4 0.000305 0.338    0.216 0.0753   0.000001 0.217 0.153
## 5 0.000115 0.322    0.201 0.0872   0.000001 0.212 0.177
## 6 0.00178  0.000001 0.234 0.132    0.000001 0.268 0.364

The fam file

fam_file <- here(
  "euro_global/output/snps_sets/r2_0.1.fam"
)

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask7 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask7)

head(fask7)
##    ind pop       X1       X2       X3       X4    X5       X6       X7
## 1 1001 OKI 0.000001 0.398283 0.187190 0.000001 1e-06 0.224832 0.189691
## 2 1002 OKI 0.000001 0.374411 0.149325 0.105207 1e-06 0.233370 0.137684
## 3 1003 OKI 0.002406 0.000001 0.269369 0.000001 1e-06 0.304655 0.423566
## 4 1004 OKI 0.000305 0.338142 0.216241 0.075254 1e-06 0.216667 0.153390
## 5 1005 OKI 0.000115 0.322164 0.201010 0.087225 1e-06 0.212258 0.177226
## 6 1006 OKI 0.001777 0.000001 0.234360 0.132068 1e-06 0.267600 0.364193

Rename the columns

# Rename the columns starting from the third one
fask7 <- fask7 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask7)
##    ind pop       v1       v2       v3       v4    v5       v6       v7
## 1 1001 OKI 0.000001 0.398283 0.187190 0.000001 1e-06 0.224832 0.189691
## 2 1002 OKI 0.000001 0.374411 0.149325 0.105207 1e-06 0.233370 0.137684
## 3 1003 OKI 0.002406 0.000001 0.269369 0.000001 1e-06 0.304655 0.423566
## 4 1004 OKI 0.000305 0.338142 0.216241 0.075254 1e-06 0.216667 0.153390
## 5 1005 OKI 0.000115 0.322164 0.201010 0.087225 1e-06 0.212258 0.177226
## 6 1006 OKI 0.001777 0.000001 0.234360 0.132068 1e-06 0.267600 0.364193

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
2.5.2 Using ggplot2 for individual admixtures K=7
source(
  here("scripts", "RMarkdowns", 
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask7 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 


color_palette_7 <-
  c(
    "#1E90FF",       
    "#75FAFF",     
    "#FF8C1A",        
    "#FFFF19",
    "purple3",
    "#77DD37",    
    "red")


# Generate all potential variable names
all_variables <- paste0("v", 1:7)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_7[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 10
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=7.\n fastStructure inference for k7 with 56,384 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_7) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "fastStructure_k=7_euro_global_r2_1_run1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.6 K=21 plots

Plots for fastStructure k=21 for r2<0.1

color_palette_21 <-
  c(
    "#FF8C1A",
    "yellow2",
    "#77DD77",
    "chocolate4",
    "#B22222",
    "purple",
    "#B20CC9",
    "#F49AC2",
    "blue",
    "#1E90FF",
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "green",
    "#008080",
    "goldenrod3",
    "orangered2"
    )
2.6.1 Extract ancestry coefficients for k=21

change to correct run for matrix

fask21 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run004/simple.21.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask21)
## # A tibble: 6 × 21
##      X1    X2    X3    X4    X5    X6    X7    X8     X9    X10   X11   X12
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl>
## 1     0     0 0     0         0     0     0     0 0.0355 0          0 0.612
## 2     0     0 0.154 0.166     0     0     0     0 0      0.0983     0 0.437
## 3     0     0 0     0         0     0     0     0 0      0          0 1.00 
## 4     0     0 0     0         0     0     0     0 0      0          0 1.00 
## 5     0     0 0     0         0     0     0     0 0      0          0 1.00 
## 6     0     0 0     0         0     0     0     0 0      0          0 1.00 
## # ℹ 9 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>

The fam file

fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.1.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask21 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask21)

head(fask21)
##    ind pop X1 X2       X3       X4 X5 X6 X7 X8       X9      X10 X11      X12
## 1 1001 OKI  0  0 0.000000 0.000000  0  0  0  0 0.035524 0.000000   0 0.612431
## 2 1002 OKI  0  0 0.153738 0.166292  0  0  0  0 0.000000 0.098323   0 0.437370
## 3 1003 OKI  0  0 0.000000 0.000000  0  0  0  0 0.000000 0.000000   0 0.999991
## 4 1004 OKI  0  0 0.000000 0.000000  0  0  0  0 0.000000 0.000000   0 0.999991
## 5 1005 OKI  0  0 0.000000 0.000000  0  0  0  0 0.000000 0.000000   0 0.999991
## 6 1006 OKI  0  0 0.000000 0.000000  0  0  0  0 0.000000 0.000000   0 0.999991
##   X13      X14 X15      X16      X17 X18 X19 X20 X21
## 1   0 0.000000   0 0.352037 0.000000   0   0   0   0
## 2   0 0.037691   0 0.000000 0.106579   0   0   0   0
## 3   0 0.000000   0 0.000000 0.000000   0   0   0   0
## 4   0 0.000000   0 0.000000 0.000000   0   0   0   0
## 5   0 0.000000   0 0.000000 0.000000   0   0   0   0
## 6   0 0.000000   0 0.000000 0.000000   0   0   0   0

Rename the columns

# Rename the columns starting from the third one
fask21 <- fask21 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask21)
##    ind pop v1 v2       v3       v4 v5 v6 v7 v8       v9      v10 v11      v12
## 1 1001 OKI  0  0 0.000000 0.000000  0  0  0  0 0.035524 0.000000   0 0.612431
## 2 1002 OKI  0  0 0.153738 0.166292  0  0  0  0 0.000000 0.098323   0 0.437370
## 3 1003 OKI  0  0 0.000000 0.000000  0  0  0  0 0.000000 0.000000   0 0.999991
## 4 1004 OKI  0  0 0.000000 0.000000  0  0  0  0 0.000000 0.000000   0 0.999991
## 5 1005 OKI  0  0 0.000000 0.000000  0  0  0  0 0.000000 0.000000   0 0.999991
## 6 1006 OKI  0  0 0.000000 0.000000  0  0  0  0 0.000000 0.000000   0 0.999991
##   v13      v14 v15      v16      v17 v18 v19 v20 v21
## 1   0 0.000000   0 0.352037 0.000000   0   0   0   0
## 2   0 0.037691   0 0.000000 0.106579   0   0   0   0
## 3   0 0.000000   0 0.000000 0.000000   0   0   0   0
## 4   0 0.000000   0 0.000000 0.000000   0   0   0   0
## 5   0 0.000000   0 0.000000 0.000000   0   0   0   0
## 6   0 0.000000   0 0.000000 0.000000   0   0   0   0

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask21 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette_21 <-
  c(
    "chocolate4",
    "green4",
    "#1E90FF",
    "#B20CC9",
    "navy",     
    "#75FAFF",    
    "#008080",
    "#77DD77",
    "magenta",   
    "orangered2",       
    "purple",
    "yellow2",
    "goldenrod3",
    "green",
    "#FF8C1A",
    "#AE9393",
    "purple4",
    "#B22222",     
    "#FFFF99",
    "#F49AC2",  
    "blue"    
    )


# Generate all potential variable names
all_variables <- paste0("v", 1:21)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_21[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=21.\n fastStructure inference for k21 with 56,384 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_21) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "faststructure_k=21_euro_global_r2_1_run4.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.7 K=23 plots

Plots for fastStructure K=23 for r2<0.1

2.7.1 Extract ancestry coefficients for k=23

change to correct run for matrix

fask23 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run023/simple.23.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask23)
## # A tibble: 6 × 23
##      X1    X2     X3    X4    X5    X6    X7     X8    X9   X10   X11   X12
##   <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     0     0 0.0125     0 0.601     0     0 0          0 0         0     0
## 2     0     0 0          0 0.437     0     0 0.0962     0 0.221     0     0
## 3     0     0 0          0 1.00      0     0 0          0 0         0     0
## 4     0     0 0          0 1.00      0     0 0          0 0         0     0
## 5     0     0 0          0 1.00      0     0 0          0 0         0     0
## 6     0     0 0          0 1.00      0     0 0          0 0         0     0
## # ℹ 11 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>, X22 <dbl>, X23 <dbl>

The fam file

fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.1.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask23 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask23)

head(fask23)
##    ind pop X1 X2       X3 X4       X5 X6 X7       X8 X9      X10 X11 X12
## 1 1001 OKI  0  0 0.012521  0 0.600878  0  0 0.000000  0 0.000000   0   0
## 2 1002 OKI  0  0 0.000000  0 0.436931  0  0 0.096157  0 0.220841   0   0
## 3 1003 OKI  0  0 0.000000  0 0.999991  0  0 0.000000  0 0.000000   0   0
## 4 1004 OKI  0  0 0.000000  0 0.999991  0  0 0.000000  0 0.000000   0   0
## 5 1005 OKI  0  0 0.000000  0 0.999991  0  0 0.000000  0 0.000000   0   0
## 6 1006 OKI  0  0 0.000000  0 0.999991  0  0 0.000000  0 0.000000   0   0
##        X13      X14 X15 X16 X17 X18 X19 X20 X21 X22 X23
## 1 0.000000 0.386593   0   0   0   0   0   0   0   0   0
## 2 0.246063 0.000000   0   0   0   0   0   0   0   0   0
## 3 0.000000 0.000000   0   0   0   0   0   0   0   0   0
## 4 0.000000 0.000000   0   0   0   0   0   0   0   0   0
## 5 0.000000 0.000000   0   0   0   0   0   0   0   0   0
## 6 0.000000 0.000000   0   0   0   0   0   0   0   0   0

Rename the columns

# Rename the columns starting from the third one
fask23 <- fask23 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask23)
##    ind pop v1 v2       v3 v4       v5 v6 v7       v8 v9      v10 v11 v12
## 1 1001 OKI  0  0 0.012521  0 0.600878  0  0 0.000000  0 0.000000   0   0
## 2 1002 OKI  0  0 0.000000  0 0.436931  0  0 0.096157  0 0.220841   0   0
## 3 1003 OKI  0  0 0.000000  0 0.999991  0  0 0.000000  0 0.000000   0   0
## 4 1004 OKI  0  0 0.000000  0 0.999991  0  0 0.000000  0 0.000000   0   0
## 5 1005 OKI  0  0 0.000000  0 0.999991  0  0 0.000000  0 0.000000   0   0
## 6 1006 OKI  0  0 0.000000  0 0.999991  0  0 0.000000  0 0.000000   0   0
##        v13      v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
## 1 0.000000 0.386593   0   0   0   0   0   0   0   0   0
## 2 0.246063 0.000000   0   0   0   0   0   0   0   0   0
## 3 0.000000 0.000000   0   0   0   0   0   0   0   0   0
## 4 0.000000 0.000000   0   0   0   0   0   0   0   0   0
## 5 0.000000 0.000000   0   0   0   0   0   0   0   0   0
## 6 0.000000 0.000000   0   0   0   0   0   0   0   0   0

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask23 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

color_palette_23 <-
  c(
    "blue",
    "#75FAFF",
    "#1E90FF",
    "#FF8C1A",    
    "magenta",    
    "purple4",
    "#B22222",
    "#77DD77",
    "orangered",
    "goldenrod",
    "green4", 
    "purple",  
    "yellow2",    
    "green",
    "navy", 
    "coral",
    "#F49AC2",
    "chocolate4",
    "#B20CC9",
    "#AE9393",
    "orchid1",
    "#008080", 
    "#FFFF99"    
  )
  
# Generate all potential variable names
all_variables <- paste0("v", 1:23)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                          color = color_palette_23[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=23.\n fastStructure inference for k23 with 56,384 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_23) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "faststructure_k=23_euro_global_r2_1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

1. SNP Set 3: MAF 1% Euro_global dataset structure analyses

1.1 Run Simple prior in fastStructure for file with LD pruning with r2=0.01 & MAF 1%

1.1.1 Check populations in file

awk '{print $1}' euro_global/output/snps_sets/r2_0.1_b.fam | sort | uniq -c | awk '{print $2}' 
## ALD
## ALU
## ALV
## ARM
## BAR
## BEN
## BER
## BRE
## BUL
## CAM
## CES
## CHA
## CRO
## DES
## FRS
## GEL
## GES
## GRA
## GRC
## GRV
## HAI
## HAN
## HOC
## HUN
## IMP
## INJ
## INW
## ITB
## ITP
## ITR
## JAF
## KAC
## KAG
## KAN
## KAT
## KER
## KLP
## KRA
## KUN
## LAM
## MAL
## MAT
## OKI
## PAL
## POL
## POP
## QNC
## RAR
## REC
## ROM
## ROS
## SER
## SEV
## SIC
## SLO
## SOC
## SON
## SPB
## SPC
## SPM
## SPS
## SSK
## STS
## SUF
## SUU
## TAI
## TIK
## TIR
## TRE
## TUA
## TUH
## UTS
## YUN

Count the total number of population

awk '{print $1}' euro_global/output/snps_sets/r2_0.1_b.fam | sort | uniq -c | wc -l
#73
## 73

1.1.2 Create a directory for each run.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure
mkdir MAF_1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1
for i in $(seq -w 1 100)
do
  mkdir run$i
done
#ls #should have 100 folders now

1.1.3 Create the dsq files

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1
module load dSQ/1.05 

for k in $(seq 1 25); do
    for run in $(seq -w 1 100); do
        echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1; source /vast/palmer/apps/software/miniconda/23.3.1/etc/profile.d/conda.sh; module load miniconda/4.12.0; conda activate fastStructure; export PYTHONPATH=/home/mkc54/fastStructure/; python -m structure -K $k --input=/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.01_b --output=/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run$run/simple --full --cv=10 --tol=10e-6"
    done
done > dsq_euro_global_MAF_1.txt

head dsq_euro_global_MAF_1.txt #check file

dsq --job-file dsq_euro_global_MAF_1.txt --output /dev/null --mail-type ALL -t 24:00:00 --partition=ycga --cpus-per-task=10 --job-name fS.simple_euro_global_MAF_1 --batch-file simple_euro_global_MAF_1.sh

#Check the files to see if they have 2500 jobs:
head simple_euro_global_MAF_1.sh

1.1.4 Run the jobs

Run a random job from our array to see if works

sbatch --array=$((1 + $RANDOM % 10)) simple_euro_global_MAF_1.sh

# check status
dsqa -j 18536432 #insert job number

#if it looks like its running ok, you can cancel it 
scancel 18536432

Submit all jobs.

sbatch simple_euro_global_MAF_1.sh
#Submitted batch job # 

#Check it
dsqa -j 18549658 #add job number here

1.1.5 Run autopsy once its done

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1
module load dSQ/1.05

dsqa -j 18549658 #all completed

1.1.6. Find the optimal number of K.

you will see it varies from run to run. You can estimate mean, mode and median values. Then you choose one that makes sense.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1
srun -p ycga --pty -N 1 -n 1 -c 4 bash #start interactive session

module load miniconda
conda activate fastStructure 

#First check files
ls run*/* | wc -l # should be 12500

# chose k for manuscript data
export PYTHONPATH=/home/mkc54/fastStructure
python -m chooseK #see how we can use chooseK
#python /vast/palmer/home.mccleary/mkc54/fastStructure/chooseK.py
   #      --input=<file>

for i in cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run*
do
    echo $i | sed 's@.*/@@'; python -m chooseK --input=$i/simple | grep 'likelihood\|structure'
done

Get outputs in a file

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1

cat <(echo 'run likelihood structure') \
<(for i in /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run*
do
    echo $i | sed 's@.*/@@' | sed 's/run/run /' ; python -m chooseK --input=$i/simple | grep 'likelihood' | awk '{print $6, $8}'; python -m chooseK --input=$i/simple | grep 'structure' |  awk '{print $6, $10}'
done | xargs -n6 | awk '{print $2, $4, $6}') > simple_euro_global_MAF_1_25k.txt

head simple_euro_global_MAF_1_25k.txt #check the file
#ok

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1
cp -r simple_euro_global_MAF_1_25k.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global #copy file 

1.1.7 Make a plot with the Ks.

Let’s import the data in R.

# library(devtools)         # if you need to install some package from github or something else
library(tidyverse)          # many helpful things
library(here)               # to load data easily
#library(colorout)           # colors are cool
library(dplyr)              # to manipulate data
library(ggplot2)            # plots
# library(RColorBrewer)     # color pallets
# library(grid)             # set up plots
# library(scales)           # helps with plots axis scales
# library(showtext)         # helps annotating plots
# library(reticulate)       # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont)          # probably won't work on openOnDemand on the clusters
library(hrbrthemes)         # not really needed, personal preference for plots
#library(hrbrmisc)           # not really needed, personal preference for plots
# library(stringr)          # for strings operations
#library(ggstatsplot)        # statistics and plotting
library(flextable)          # create tables
library(officer)            # export office format
library(here)

Import fastStucture

# function to import our choosek.py data
import_fastStructure <- function(file) {
  # import as a tibble and set columns as integers
  dat <- read_delim(
    file,
    col_names      = TRUE,
    show_col_types = FALSE,
    col_types      = "iii"
  )

  # get columns we need and make it long for plotting
  dat <- dat |>
    gather(
      structure, likelihood, -run
    )

  # rename the columns by index
  dat <- dat |>
    rename(
      run   = 1,
      model = 2,
      k     = 3
    )
  return(dat)
}

Run import function for simple prior models for simple euro_global Set 3 (MAF<1%)

choose_k_simple <- 
  import_fastStructure(
  here("scripts", "RMarkdowns", "output", "euro_global", "simple_euro_global_MAF_1_25k.txt"
  )
)

Function to plot fastStructure choosek.py results

# function to plot our choosek.py data
plot_fastStructure <- function(df) {
  df |>
    ggplot() +
    geom_line(
      aes(
        x              = run,
        y              = k,
        color          = model
      ),
      linewidth = 1
    ) +
    scale_colour_manual(
      "model",
      values = c(
        structure      = "#9d60ff",
        likelihood     = "#ffc08c"
      ),
      labels = c(
        "Maximizes \n Likelihood \n", "Explain \n Structure"
      )
    ) +
    labs(
      x                = "Run",
      y                = "K",
      title            = "fastStructure simple for Europe, Asia & the Americas (MAF 1%)",
      caption          = "algorithm runs for K ranging from 1 to 25"
    ) +
    theme(
      panel.grid.major = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      panel.grid.minor = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      legend.text = element_text(
        size = 12
      ),
      legend.title = element_text(
        size           = 14,
        face           = "bold"
      ),
      legend.position = "right"
    )
}

Use our function to plot k values

plot_fastStructure(
  choose_k_simple
) +
  labs(
    subtitle = "simple fastStructure for Europe, Asia & the Americas (MAF 1%)"
  )

# save plot
ggsave(
  here("scripts", "RMarkdowns",
    "output", "euro_global", "fastStructure", "fastStructure_simple_k25_euro_global_MAF_1.pdf"
  ),
  width  = 8,
  height = 6,
  units  = "in"
)

1.1.8 Find mean K maximizing likelihood

choose_k_likelihood <- subset(choose_k_simple, model != "structure")
mean(choose_k_likelihood$k) #23.07
## [1] 23.07
median(choose_k_likelihood$k) #24
## [1] 24
common <- table(choose_k_likelihood$k)
common #K=25
## 
## 17 19 20 21 22 23 24 25 
##  1  3  7 13 11 14 19 32

1.2 Run admixture for SNP Set 3 (MAF 1%, r2<0.01)

1.2.1 Export path

# Add this in your script
export PATH=$PATH:/home/mkc54/admixture/releases/admixture_linux-1.3.0/

# Then call admixture from anywhere
admixture -h

1.2.2 Create directories for r_1

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture

mkdir MAF_1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1

for i in $(seq 1 5)
do
  mkdir run$i
done

1.2.3 Generate script for Admixture. We will use 2000 bootstraps, and cv=10.


admixture r2_0.1.bed $name -j20 --cv=10 -B2000 # we will run with 1000 bootstraps, cross-validation 10, using 20 threads

Now generate the full script for dsq.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1
for run in $(seq 1 5); do
    for k in $(seq 1 25); do
        echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run$run; \
module load ADMIXTURE/1.3.0; \
admixture -s $run --cv=10 -B2000 -j20 /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.01_b.bed $k | tee log_k$k.txt"
    done
done > dsqr_1.txt  #  jobs
module load dSQ/1.05
# make script
dsq \
--job-file dsqr_1.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 10g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_euro_global_MAF \
--batch-file admixture_euro_global_MAF.sh 
sed -i '6 i #SBATCH --requeue' admixture_euro_global_MAF.sh;

#check it
head admixture_euro_global_MAF.sh

1.2.4 Run Admixture for SNP Set 3 for euro_global

Before we submit all jobs, it is a good practice to submit only one job of the array and see if it runs without problems.

# create a directory for the logs
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1
mkdir logs
#
# submit 1the first job to see if it works.
sbatch --array=0 admixture_euro_global_MAF.sh
# Submitted batch job 20520272
# check status
dsqa -j 20520272  # if no errors, we can go ahead and submit all jobs
scancel 20520272

If it runs without problems, we can submit all jobs.

sbatch admixture_euro_global_MAF.sh
# Submitted batch job 
#original 

# check status
dsqa -j 20520357

1.2.5 Run autopsy once it is done.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1

dsqa -j 20520357 -f dsqr_1.txt -s TIMEOUT > rerun_jobs.txt; wc -l rerun_jobs.txt
#3 timedout

dsq \
--job-file rerun_jobs.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 50g \
-t 24:00:00 \
--cpus-per-task=20 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_euro_global_MAF_rr \
--batch-file admixture_euro_global_MAF_rr.sh 
sed -i '6 i #SBATCH --requeue' admixture_euro_global_MAF_rr.sh;

head admixture_euro_global_MAF_rr.sh

#Export path again before re-running
export PATH=$PATH:/home/mkc54/admixture/releases/admixture_linux-1.3.0/

# Then call admixture from anywhere
admixture -h

#submit reruns
sbatch admixture_euro_global_MAF_rr.sh #20616031

dsqa -j 20616031

dsqa -j 20616031 -f rerun_jobs.txt -s CANCELLED > rerun_jobs2.txt; wc -l rerun_jobs2.txt
dsqa -j 20616031 -f rerun_jobs.txt -s TIMEOUT > rerun_jobs3.txt; wc -l rerun_jobs3.txt

#IF NEEDED...
dsq \
--job-file rerun_jobs2.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 20g \
-t 7-00:00:00 \
--cpus-per-task=30 \
--mail-type ALL \
--partition=week \
--job-name admixture_euro_global_rr2 \
--batch-file admixture_euro_global_rr2.sh 
sed -i '6 i #SBATCH --requeue' admixture_euro_global_rr2.sh;

sbatch admixture_euro_global_rr2.sh
#

dsqa -j 


dsqa -j  -f rerun_jobs2.txt -s FAILED > rerun_jobs_new.txt; wc -l rerun_jobs_new.txt

dsq \
--job-file rerun_jobs_new.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 20g \
-t 7-00:00:00 \
--cpus-per-task=30 \
--mail-type ALL \
--partition=week \
--job-name admixture_euro_global_rr4 \
--batch-file admixture_euro_global_rr4.sh 
sed -i '6 i #SBATCH --requeue' admixture_euro_global_rr4.sh;
sbatch admixture_euro_global_rr4.sh

dsqa -j 16868656 #running

dsq \
--job-file rerun_jobs3.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 20g \
-t 7-00:00:00 \
--cpus-per-task=30 \
--mail-type ALL \
--partition=week \
--job-name admixture_euro_global_rr3 \
--batch-file admixture_euro_global_rr3.sh 
sed -i '6 i #SBATCH --requeue' admixture_euro_global_rr3.sh;
sbatch admixture_euro_global_rr3.sh #15851231

dsqa -j 20616031 #completed
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1
# first get how many jobs we have
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | wc -l
# 10
# then count uniq seeds
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | uniq | wc -l
# 1
# if the output does not match, then we have duplicated seeds. Next, we would need the repeated seeds would be in the same run for different ks, we only can't have the same seed in different run for the same k.
grep -H 'seed' run*/*.txt 

Load the libraries

# library(devtools)         # if you need to install some package from github or something else
library(tidyverse)          # many helpful things
library(here)               # to load data easily
library(colorout)           # colors are cool
library(dplyr)              # to manipulate data
library(flextable)          # create tables
#library(ggstatsplot)        # statistics and plotting
# library(ggplot2)          # plots
# library(RColorBrewer)     # color pallets
# library(grid)             # set up plots
# library(scales)           # helps with plots axis scales
# library(showtext)         # helps annotating plots
# library(reticulate)       # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont)          # probably won't work on openOnDemand on the clusters
# library(hrbrthemes)       # not really needed, personal preference for plots
# library(hrbrmisc)         # not really needed, personal preference for plots
library(stringr)            # for strings operations
library(flextable)          # create tables
library(officer)            # export office format

1.2.6 Get the cross validation values

Collect the cross validation information from the all the log files. We have to do it for each run. We need to get the CV and the K from each log file.

# navigate to the data directory
cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1
# let's check the log files using grep. We need to get the line with the CV values. We can grep CV
grep CV run1/log*.txt | head
#run1/log_k13.txt:CV error (K=13): 0.64586
#run1/log_k1.txt:CV error (K=1): 0.71066
#run1/log_k2.txt:CV error (K=2): 0.68716

1.2.7 Parse the logs for allsnps

Make loop to get summary from all 5 runs

# on the cluster, we can build the loop step by step as well. For example, we can see if our loop is getting the files we need. Then, instead of a command, we can use echo $i to list the files. We need to use "| head", otherwise it will list all 4,000 files.
for i in $(ls -1 run*/log*.txt); do
    echo $i 
done | head -n 2
#run1/log_k10.txt
#run1/log_k11.txt

# now we can replace echo with our loop from the previous chunk, and save it as a file.
# remember, we now have to use grep and not echo
# first let test to command. Sometimes the tools from Mac OS and Linux are different. We have to add the flag -H for grep to print the file path, we need it to get the run number.
for i in $(ls -1 run*/log*.txt); do
    grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done | head -n 2
#1 10 0.69538
#1 11 0.69533
# it works, now we can run the loop and write the results in a file.
for i in $(ls -1 run*/log*.txt); do
    grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done > cross_validation_euro_global_MAF1.txt
# we should have 150 lines in our file
wc -l cross_validation_euro_global_MAF1.txt
# 125 

head cross_validation_euro_global_MAF1.txt
# it looks okay

cd /gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1
cp -r cross_validation_euro_global_MAF1.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/analyses/euro_global

Import the cross validation values into R

# the files are in the output directory
##  ............................................................................
#cd /vast/palmer/scratch/caccone/mkc54/albo/admixture_native
#working directory /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/analyses/admixture_native

cross_val <-
  read_delim(
    here(
      "/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/analyses/euro_global/cross_validation_euro_global_MAF1.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "iin"
  ) |>
  rename(
    run = 1,
    k   = 2,
    cv  = 3
  ) |>
  mutate(
    run = as.factor(
      run
    )
  )

Make plot of the cross-validation errors for 5 runs using 2000 bootstraps and 10 fold cross validation

##  ............................................................................
##  1000 bootstraps and 10 folds cv                                        ####
#
# make plot
cross_val |>
  ggplot() +
  geom_line(
    aes(
      x        = k,
      y        = cv,
      group    = run,
      color    = run,
      linetype = run
    ),
    linewidth  = .75
  ) +
  labs(
    x        = "K",
    y        = "Cross-validation error",
    title    = "Admixture Cross-validation for Europe, Americas & Native range (22,642 SNPs)",
    subtitle = "2000 bootstraps and cv = 10 ",
    caption  = "algorithm runs for choices of K ranging from 1 to 25"
  ) +
  scale_color_manual(
    values = c(
      "#C1CDCD", "#1AC722", "#FF8C00", "#87CEFA", "#FF3030"
    )
  ) +
  theme(
    panel.grid.major = element_line(
      linetype       = "dashed",
      linewidth      = 0.2,
    ),
    legend.title     = element_text(
        size           = 14,
        face           = "bold"
      ),
    panel.grid.minor = element_line(
      linetype       = "dashed",
      linewidth      = 0.2
    )
  )

# save the plot
ggsave(
  here(
    "scripts", "RMarkdowns", "analyses", "euro_global", "admixture_euro_global_MAF_2000Bs_CV_k1_25.pdf"
  ),
  width  = 8,
  height = 6,
  units  = "in"
)

Now we can find the K with the lowest cross-validation error

##  1000 bootstraps and 10 folds cv                                         #
# to find the lowest cv error for each run
cross_val |>
  group_by(
    run
  ) |>
  summarize(
    LowestCVerror = min(
      cv,
      na.rm       = T
    )
  ) |>
  arrange(
    run
  ) -> cv_min
#
# we can also get the k with the lowest cv error
cross_val |>
  group_by(
    run
  ) |>
  slice(
    which.min(
      cv
    )
  ) |>
  arrange(
    run
  ) -> k_with_lowest_cv
#
# we can left_joint the objects and make a table
admix_ks <-
  left_join(
    cv_min,
    k_with_lowest_cv,
    by = "run"
  ) |>
    select(
      -cv
  )
#

# make table
ftbadmix <-
  flextable(
    admix_ks |>
      rename(
        Run               = 1,
        "Lowest CV error" = 2,
        K                 = 3,
      )
  )|>
  set_caption(
    as_paragraph(
      as_chunk(
        "Ks with lowest cross-validation errors for 5 runs of Admixture (22,642 SNPs)",
        props             = fp_text_default(
          font.fam        = "Cambria"
        )
      )
    ),
    word_stylena          = "Table Caption"
  )
#
# check the table
autofit(ftbadmix)
Ks with lowest cross-validation errors for 5 runs of Admixture (22,642 SNPs)

Run

Lowest CV error

K

1

0.68721

18

2

0.68690

21

3

0.68742

21

4

0.68780

18

5

0.68790

17

#
# save the table
# create settings - check ??save_as_docx on console
sect_properties <-
  prop_section(
  page_size    = page_size(
    orient     = "portrait",
    width      = 8.3, 
    height     = 11.7
  ),
  type         = "continuous",
  page_margins = page_mar()
)

Best: K=21 (run 2)

# save
# then you can open MS Word and make adjustments
ftbadmix |>
  autofit() |>
  save_as_docx(
    path       = here(
      "scripts", "RMarkdowns", "analyses", "euro_global", "admixture_euro_globa_CV_k1_25_MAF1.docx"
    ),
    pr_section = sect_properties
  )

2. Plots for fastStructure for SNP Set 3 (MAF 1%)

2.1 Plot k=20

Using ggplot2 for individual admixtures

2.1.1 Extract ancestry coefficients for k=20

leak20 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run022/simple.20.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(leak20)
## # A tibble: 6 × 20
##         X1       X2     X3    X4    X5    X6    X7    X8    X9   X10   X11   X12
##      <dbl>    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.165    0.000001   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 0.586  1e-6
## 2 0.000001 0.000001   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 0.418  1e-6
## 3 0.000001 0.000001   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1.00   1e-6
## 4 0.000001 0.000001   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1.00   1e-6
## 5 0.000001 0.000001   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1.00   1e-6
## 6 0.000001 0.000001   1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1.00   1e-6
## # ℹ 8 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>, X19 <dbl>, X20 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

leak20 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak20)

head(leak20)
##    ind pop       X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1 1001 OKI 0.164897 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1002 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1003 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1004 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1005 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1006 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
##        X11   X12   X13   X14      X15      X16      X17      X18   X19      X20
## 1 0.586264 1e-06 1e-06 1e-06 0.097707 0.000001 0.117853 0.033261 1e-06 0.000001
## 2 0.418060 1e-06 1e-06 1e-06 0.215431 0.086892 0.000001 0.000001 1e-06 0.279598
## 3 0.999979 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 1e-06 0.000001
## 4 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 1e-06 0.000001
## 5 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 1e-06 0.000001
## 6 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 1e-06 0.000001

Rename the columns

# Rename the columns starting from the third one
leak20 <- leak20 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak20)
##    ind pop       v1    v2    v3    v4    v5    v6    v7    v8    v9   v10
## 1 1001 OKI 0.164897 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1002 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1003 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1004 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1005 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1006 OKI 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
##        v11   v12   v13   v14      v15      v16      v17      v18   v19      v20
## 1 0.586264 1e-06 1e-06 1e-06 0.097707 0.000001 0.117853 0.033261 1e-06 0.000001
## 2 0.418060 1e-06 1e-06 1e-06 0.215431 0.086892 0.000001 0.000001 1e-06 0.279598
## 3 0.999979 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 1e-06 0.000001
## 4 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 1e-06 0.000001
## 5 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 1e-06 0.000001
## 6 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 1e-06 0.000001

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

2.1.2 Using ggplot2 for individual admixtures for k=20

source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- leak20 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 
color_palette_20 <-
  c("chocolate4",
    "#F49AC2",
    "green", 
    "green4",
    "purple4",
    "#008080",
    "#1E90FF",  
    "#B22222",
    "navy", 
    "orangered",
    "#FFFF99",
    "#75FAFF",
    "blue",
    "#FF8C1A",
    "purple",
    "#AE9393",
    "magenta", 
    "#77DD77",
    "#B20CC9",
    "yellow2")
all_variables <- c("v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16","v17","v18","v19","v20")

# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette_20[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=20.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
 scale_fill_manual(values = color_palette_20) +
#  expand_limits(y = c(0, 1.5))
#  scale_fill_manual(values = color_mapping$color) +  # Updated line
  expand_limits(y = c(0, 1.5))

color_palette_20 <-
  c(
    "blue",
    "#F49AC2",
    "#008080",
    "goldenrod",
    "green4",
    "#1E90FF", 
    "#B20CC9",
    "navy", 
    "#B22222",
    "orangered",
    "#77DD77",
    "green", 
    "#FFFF99",
    "#FF8C1A",   
    "purple",
    "#AE9393",
    "magenta", 
    "#75FAFF",
    "purple4",    
    "yellow2" 
)

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=20.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
 scale_fill_manual(values = color_palette_20) +
#  expand_limits(y = c(0, 1.5))
#  scale_fill_manual(values = color_mapping$color) +  # Updated line
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=20_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.2 Plot k=23

Using ggplot2 for individual admixtures

2.2.1 Extract ancestry coefficients for k=23

fask23 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run033/simple.23.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask23)
## # A tibble: 6 × 23
##         X1      X2    X3    X4    X5    X6    X7    X8    X9     X10   X11   X12
##      <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1 0.000001    1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## 2 0.000001    1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 3.91e-2  1e-6  1e-6
## 3 0.000001    1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## 4 0.000001    1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## 5 0.000001    1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## 6 0.000001    1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## # ℹ 11 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>, X22 <dbl>, X23 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask23 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask23)

head(fask23)
##    ind pop    X1    X2    X3    X4    X5    X6    X7    X8    X9      X10   X11
## 1 1001 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
## 2 1002 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.039104 1e-06
## 3 1003 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
## 4 1004 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
## 5 1005 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
## 6 1006 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
##     X12      X13      X14   X15   X16   X17      X18      X19      X20      X21
## 1 1e-06 0.507835 0.000001 1e-06 1e-06 1e-06 0.003427 0.003905 0.367266 0.064383
## 2 1e-06 0.389715 0.414105 1e-06 1e-06 1e-06 0.000001 0.000001 0.089450 0.000001
## 3 1e-06 0.000001 0.000001 1e-06 1e-06 1e-06 0.000001 0.000001 0.999978 0.000001
## 4 1e-06 0.000001 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001
## 5 1e-06 0.000001 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001
## 6 1e-06 0.000001 0.000001 1e-06 1e-06 1e-06 0.000001 0.000001 0.999978 0.000001
##     X22      X23
## 1 1e-06 0.053166
## 2 1e-06 0.067609
## 3 1e-06 0.000001
## 4 1e-06 0.000001
## 5 1e-06 0.000001
## 6 1e-06 0.000001

Rename the columns

# Rename the columns starting from the third one
fask23 <- fask23 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask23)
##    ind pop    v1    v2    v3    v4    v5    v6    v7    v8    v9      v10   v11
## 1 1001 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
## 2 1002 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.039104 1e-06
## 3 1003 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
## 4 1004 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
## 5 1005 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
## 6 1006 OKI 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 0.000001 1e-06
##     v12      v13      v14   v15   v16   v17      v18      v19      v20      v21
## 1 1e-06 0.507835 0.000001 1e-06 1e-06 1e-06 0.003427 0.003905 0.367266 0.064383
## 2 1e-06 0.389715 0.414105 1e-06 1e-06 1e-06 0.000001 0.000001 0.089450 0.000001
## 3 1e-06 0.000001 0.000001 1e-06 1e-06 1e-06 0.000001 0.000001 0.999978 0.000001
## 4 1e-06 0.000001 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001
## 5 1e-06 0.000001 0.999978 1e-06 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001
## 6 1e-06 0.000001 0.000001 1e-06 1e-06 1e-06 0.000001 0.000001 0.999978 0.000001
##     v22      v23
## 1 1e-06 0.053166
## 2 1e-06 0.067609
## 3 1e-06 0.000001
## 4 1e-06 0.000001
## 5 1e-06 0.000001
## 6 1e-06 0.000001

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_23 <-
  c(
    "purple4",
    "purple", 
    "orangered",
    "#FF8C1A",
    "#F49AC2",
    "magenta",
    "#AE9393",
    "#FFFF99",
    "orchid1",
    "#1E90FF",
    "chocolate4",
    "#B20CC9",
    "goldenrod",
    "#77DD77",
    "blue",
    "navy", 
    "green",
    "green4", 
    "#B22222",
    "#008080", 
    "coral",
    "yellow2",
    "#75FAFF"
      )

2.2.2 Using ggplot2 for individual admixtures for k=23

source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask23 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:23)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_23[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=23.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_23) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=23_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)
color_palette_23 <-
  c(
    "orangered",   
    "green4", 
    "yellow2",   
    "#008080", 
    "#1E90FF",   
    "#B22222",
    "#B20CC9",   
    "green",
    "#75FAFF",    
    "purple4",
    "coral",    
    "blue",
    "purple",   
    "#FF8C1A",
    "#F49AC2",
    "#77DD77",
    "#AE9393",    
    "navy",     
    "goldenrod",    
    "#FFFF99",            
    "magenta",
    "orchid1",  
    "chocolate4"
      )
  

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=23.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_23) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=23_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.3 Plot k=6

Using ggplot2 for individual admixtures

2.3.1 Extract ancestry coefficients for k=6

fask6 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run006/simple.6.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask6)
## # A tibble: 6 × 6
##      X1      X2       X3    X4     X5    X6
##   <dbl>   <dbl>    <dbl> <dbl>  <dbl> <dbl>
## 1 0.477 0.0398  0.000004 0.189 0.0894 0.204
## 2 0.474 0.0232  0.000004 0.221 0.0928 0.189
## 3 0.548 0.00873 0.000004 0.195 0.0959 0.153
## 4 0.512 0.00165 0.000004 0.190 0.0714 0.225
## 5 0.533 0.00357 0.000004 0.174 0.0907 0.199
## 6 0.531 0.00668 0.000004 0.199 0.0727 0.190

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask6 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask6)

head(fask6)
##    ind pop       X1       X2    X3       X4       X5       X6
## 1 1001 OKI 0.477372 0.039770 4e-06 0.189186 0.089406 0.204263
## 2 1002 OKI 0.474184 0.023159 4e-06 0.220753 0.092837 0.189063
## 3 1003 OKI 0.548183 0.008728 4e-06 0.194525 0.095937 0.152624
## 4 1004 OKI 0.511817 0.001650 4e-06 0.190439 0.071414 0.224676
## 5 1005 OKI 0.532851 0.003570 4e-06 0.174067 0.090744 0.198764
## 6 1006 OKI 0.530726 0.006682 4e-06 0.199365 0.072725 0.190498

Rename the columns

# Rename the columns starting from the third one
fask6 <- fask6 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask6)
##    ind pop       v1       v2    v3       v4       v5       v6
## 1 1001 OKI 0.477372 0.039770 4e-06 0.189186 0.089406 0.204263
## 2 1002 OKI 0.474184 0.023159 4e-06 0.220753 0.092837 0.189063
## 3 1003 OKI 0.548183 0.008728 4e-06 0.194525 0.095937 0.152624
## 4 1004 OKI 0.511817 0.001650 4e-06 0.190439 0.071414 0.224676
## 5 1005 OKI 0.532851 0.003570 4e-06 0.174067 0.090744 0.198764
## 6 1006 OKI 0.530726 0.006682 4e-06 0.199365 0.072725 0.190498

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_6 <-
    c(
    "#77DD37",  
    "red",
    "purple3",
    "#FFFF19",
    "#1E90FF",
    "#FF8C1A"
        )

2.3.2 Using ggplot2 for individual admixtures fpr k=6

source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask6 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:6)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_6[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_6) +
  expand_limits(y = c(0, 1.5))

color_palette_6 <-
    c(
    "purple3",
    "#FF8C1A",
    "#FFFF19",
    "#1E90FF",
    "red",
    "#77DD37"
)

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_6) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=6_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.4 Plot k=7

2.4.1 Extract ancestry coefficients for k=7

fask7 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run006/simple.7.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask7)
## # A tibble: 6 × 7
##         X1     X2    X3       X4    X5       X6     X7
##      <dbl>  <dbl> <dbl>    <dbl> <dbl>    <dbl>  <dbl>
## 1 0.000003 0.103  0.588 0.000003 0.184 0.0836   0.0410
## 2 0.000003 0.107  0.565 0.000003 0.215 0.0655   0.0471
## 3 0.000003 0.115  0.638 0.000003 0.189 0.000003 0.0577
## 4 0.000003 0.0973 0.585 0.000003 0.187 0.000003 0.131 
## 5 0.000003 0.115  0.611 0.000003 0.169 0.000003 0.105 
## 6 0.000003 0.0947 0.624 0.000003 0.193 0.000003 0.0876

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask7 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask7)

head(fask7)
##    ind pop    X1       X2       X3    X4       X5       X6       X7
## 1 1001 OKI 3e-06 0.103390 0.587725 3e-06 0.184253 0.083585 0.041040
## 2 1002 OKI 3e-06 0.107196 0.564975 3e-06 0.215213 0.065475 0.047134
## 3 1003 OKI 3e-06 0.115153 0.638134 3e-06 0.189013 0.000003 0.057691
## 4 1004 OKI 3e-06 0.097312 0.585401 3e-06 0.186516 0.000003 0.130761
## 5 1005 OKI 3e-06 0.114776 0.611126 3e-06 0.169186 0.000003 0.104902
## 6 1006 OKI 3e-06 0.094743 0.624495 3e-06 0.193161 0.000003 0.087592

Rename the columns

# Rename the columns starting from the third one
fask7 <- fask7 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask7)
##    ind pop    v1       v2       v3    v4       v5       v6       v7
## 1 1001 OKI 3e-06 0.103390 0.587725 3e-06 0.184253 0.083585 0.041040
## 2 1002 OKI 3e-06 0.107196 0.564975 3e-06 0.215213 0.065475 0.047134
## 3 1003 OKI 3e-06 0.115153 0.638134 3e-06 0.189013 0.000003 0.057691
## 4 1004 OKI 3e-06 0.097312 0.585401 3e-06 0.186516 0.000003 0.130761
## 5 1005 OKI 3e-06 0.114776 0.611126 3e-06 0.169186 0.000003 0.104902
## 6 1006 OKI 3e-06 0.094743 0.624495 3e-06 0.193161 0.000003 0.087592

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_7 <-
  c(
    "red",
    "#77DD37",  
    "#FF8C1A",    
    "#FFFF19",
    "#75FAFF", 
    "#1E90FF",   
    "purple3")

2.4.2 Using ggplot2 for individual admixtures for K=7

source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask7 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:7)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_7[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=7.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_7) +
  expand_limits(y = c(0, 1.5))

color_palette_7 <-
  c(
    "#77DD37",      
    "#1E90FF",      
    "purple3",    
    "#FF8C1A",  
    "#75FAFF",     
    "#FFFF19",   
    "red")

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=7.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_7) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=7_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.5 Plot k=5

2.5.1 Extract ancestry coefficients for k=5

fask5 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run006/simple.5.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask5)
## # A tibble: 6 × 5
##      X1    X2     X3    X4       X5
##   <dbl> <dbl>  <dbl> <dbl>    <dbl>
## 1 0.184 0.208 0.103  0.505 0.000044
## 2 0.216 0.192 0.106  0.487 0.000005
## 3 0.193 0.162 0.114  0.531 0.000005
## 4 0.190 0.230 0.0900 0.491 0.000005
## 5 0.172 0.205 0.109  0.514 0.000005
## 6 0.197 0.199 0.0905 0.514 0.000005

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask5)

head(fask5)
##    ind pop       X1       X2       X3       X4      X5
## 1 1001 OKI 0.184249 0.207861 0.102813 0.505033 4.4e-05
## 2 1002 OKI 0.215789 0.192029 0.105640 0.486537 5.0e-06
## 3 1003 OKI 0.193099 0.162094 0.113513 0.531289 5.0e-06
## 4 1004 OKI 0.189509 0.229674 0.089994 0.490818 5.0e-06
## 5 1005 OKI 0.172215 0.205026 0.108905 0.513849 5.0e-06
## 6 1006 OKI 0.197458 0.198503 0.090508 0.513526 5.0e-06

Rename the columns

# Rename the columns starting from the third one
fask5 <- fask5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask5)
##    ind pop       v1       v2       v3       v4      v5
## 1 1001 OKI 0.184249 0.207861 0.102813 0.505033 4.4e-05
## 2 1002 OKI 0.215789 0.192029 0.105640 0.486537 5.0e-06
## 3 1003 OKI 0.193099 0.162094 0.113513 0.531289 5.0e-06
## 4 1004 OKI 0.189509 0.229674 0.089994 0.490818 5.0e-06
## 5 1005 OKI 0.172215 0.205026 0.108905 0.513849 5.0e-06
## 6 1006 OKI 0.197458 0.198503 0.090508 0.513526 5.0e-06

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_5 <-
  c(
    "red",
    "#FF8C1A",    
    "#FFFF19",
    "#1E90FF",   
    "purple3")

2.5.2 Using ggplot2 for individual admixture for K=5

source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))


# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:5)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_5[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_5) +
  expand_limits(y = c(0, 1.5))

color_palette_5 <-
  c(
    "#FF8C1A",  
    "#FFFF19",    
    "purple3",
    "red",
    "#1E90FF")

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_5) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=5_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.6 Plot k=22

2.6.1 Extract ancestry coefficients for k=22

fask22 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run039/simple.22.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask22)
## # A tibble: 6 × 22
##         X1       X2       X3       X4       X5    X6    X7      X8      X9   X10
##      <dbl>    <dbl>    <dbl>    <dbl>    <dbl> <dbl> <dbl>   <dbl>   <dbl> <dbl>
## 1 0.354    0.0226   0.000001 0.000001 0.439     1e-6  1e-6 1   e-6 1   e-6  1e-6
## 2 0.226    0.000001 0.00228  0.000001 0.000001  1e-6  1e-6 3.21e-1 1.63e-1  1e-6
## 3 1.00     0.000001 0.000001 0.000001 0.000001  1e-6  1e-6 1   e-6 1   e-6  1e-6
## 4 0.000001 0.000001 0.000001 0.000001 0.000001  1e-6  1e-6 1   e-6 1   e-6  1e-6
## 5 0.000001 0.000001 0.000001 0.000001 0.000001  1e-6  1e-6 1   e-6 1   e-6  1e-6
## 6 1.00     0.000001 0.000001 0.000001 0.000001  1e-6  1e-6 1   e-6 1   e-6  1e-6
## # ℹ 12 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>, X22 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

fask22 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask22)

head(fask22)
##    ind pop       X1       X2       X3    X4       X5    X6    X7       X8
## 1 1001 OKI 0.353803 0.022551 0.000001 1e-06 0.438722 1e-06 1e-06 0.000001
## 2 1002 OKI 0.225560 0.000001 0.002276 1e-06 0.000001 1e-06 1e-06 0.320784
## 3 1003 OKI 0.999978 0.000001 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
## 4 1004 OKI 0.000001 0.000001 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
## 5 1005 OKI 0.000001 0.000001 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
## 6 1006 OKI 0.999978 0.000001 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
##         X9   X10      X11   X12      X13   X14      X15   X16   X17      X18
## 1 0.000001 1e-06 0.070591 1e-06 0.000001 1e-06 0.000001 1e-06 1e-06 0.114315
## 2 0.162949 1e-06 0.000001 1e-06 0.000001 1e-06 0.001087 1e-06 1e-06 0.257640
## 3 0.000001 1e-06 0.000001 1e-06 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
## 4 0.000001 1e-06 0.000001 1e-06 0.999978 1e-06 0.000001 1e-06 1e-06 0.000001
## 5 0.000001 1e-06 0.000001 1e-06 0.999978 1e-06 0.000001 1e-06 1e-06 0.000001
## 6 0.000001 1e-06 0.000001 1e-06 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
##     X19   X20      X21   X22
## 1 1e-06 1e-06 0.000001 1e-06
## 2 1e-06 1e-06 0.029688 1e-06
## 3 1e-06 1e-06 0.000001 1e-06
## 4 1e-06 1e-06 0.000001 1e-06
## 5 1e-06 1e-06 0.000001 1e-06
## 6 1e-06 1e-06 0.000001 1e-06

Rename the columns

# Rename the columns starting from the third one
fask22 <- fask22 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(fask22)
##    ind pop       v1       v2       v3    v4       v5    v6    v7       v8
## 1 1001 OKI 0.353803 0.022551 0.000001 1e-06 0.438722 1e-06 1e-06 0.000001
## 2 1002 OKI 0.225560 0.000001 0.002276 1e-06 0.000001 1e-06 1e-06 0.320784
## 3 1003 OKI 0.999978 0.000001 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
## 4 1004 OKI 0.000001 0.000001 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
## 5 1005 OKI 0.000001 0.000001 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
## 6 1006 OKI 0.999978 0.000001 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
##         v9   v10      v11   v12      v13   v14      v15   v16   v17      v18
## 1 0.000001 1e-06 0.070591 1e-06 0.000001 1e-06 0.000001 1e-06 1e-06 0.114315
## 2 0.162949 1e-06 0.000001 1e-06 0.000001 1e-06 0.001087 1e-06 1e-06 0.257640
## 3 0.000001 1e-06 0.000001 1e-06 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
## 4 0.000001 1e-06 0.000001 1e-06 0.999978 1e-06 0.000001 1e-06 1e-06 0.000001
## 5 0.000001 1e-06 0.000001 1e-06 0.999978 1e-06 0.000001 1e-06 1e-06 0.000001
## 6 0.000001 1e-06 0.000001 1e-06 0.000001 1e-06 0.000001 1e-06 1e-06 0.000001
##     v19   v20      v21   v22
## 1 1e-06 1e-06 0.000001 1e-06
## 2 1e-06 1e-06 0.029688 1e-06
## 3 1e-06 1e-06 0.000001 1e-06
## 4 1e-06 1e-06 0.000001 1e-06
## 5 1e-06 1e-06 0.000001 1e-06
## 6 1e-06 1e-06 0.000001 1e-06

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_22 <-
  c(
    "#FF8C1A",
    "yellow2",
    "#77DD77",
    "#1E90FF",
    "#B22222",
    "chocolate4",
    "#B20CC9",
    "#F49AC2",
    "blue",
    "#008080", 
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "green",
    "purple",
    "orangered2",
    "goldenrod3",
    "coral")

2.6.2 Using ggplot2 for individual admixture for K=22

source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask22 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:22)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_22[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=22.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_22) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=22_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)
color_palette_22 <-c(
    "navy", 
    "#B20CC9", 
    "orangered2",
    "magenta",
    "goldenrod3",    
    "green4",
    "#FF8C1A",
    "#F49AC2",
    "#008080", 
    "purple",
    "#77DD77",
    "#B22222",
    "coral",
    "#FFFF99",
    "#75FAFF",
    "green",
    "blue",
    "yellow2",  
    "#AE9393",   
    "chocolate4",    
    "purple4",
    "#1E90FF"    
  )
  
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=22.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_22) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=22_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

2.7 Plot k=21

2.7.1 Extract ancestry coefficients for k=21

leak21 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/MAF_1/run046/simple.21.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(leak21)
## # A tibble: 6 × 21
##         X1      X2    X3    X4    X5    X6    X7    X8    X9     X10   X11   X12
##      <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1 0.000001 1.70e-1  1e-6  1e-6 0.731  1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## 2 0.000001 1   e-6  1e-6  1e-6 0.389  1e-6  1e-6  1e-6  1e-6 8.72e-2  1e-6  1e-6
## 3 0.000001 1   e-6  1e-6  1e-6 1.00   1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## 4 0.000001 1   e-6  1e-6  1e-6 1.00   1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## 5 0.000001 1   e-6  1e-6  1e-6 1.00   1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## 6 0.000001 1   e-6  1e-6  1e-6 1.00   1e-6  1e-6  1e-6  1e-6 1   e-6  1e-6  1e-6
## # ℹ 9 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create column ID

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

leak21 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(leak21)

head(leak21)
##    ind pop    X1       X2    X3    X4       X5    X6    X7    X8    X9      X10
## 1 1001 OKI 1e-06 0.169803 1e-06 1e-06 0.731067 1e-06 1e-06 1e-06 1e-06 0.000001
## 2 1002 OKI 1e-06 0.000001 1e-06 1e-06 0.388810 1e-06 1e-06 1e-06 1e-06 0.087172
## 3 1003 OKI 1e-06 0.000001 1e-06 1e-06 0.999978 1e-06 1e-06 1e-06 1e-06 0.000001
## 4 1004 OKI 1e-06 0.000001 1e-06 1e-06 0.999978 1e-06 1e-06 1e-06 1e-06 0.000001
## 5 1005 OKI 1e-06 0.000001 1e-06 1e-06 0.999978 1e-06 1e-06 1e-06 1e-06 0.000001
## 6 1006 OKI 1e-06 0.000001 1e-06 1e-06 0.999978 1e-06 1e-06 1e-06 1e-06 0.000001
##     X11   X12      X13      X14      X15      X16      X17      X18   X19   X20
## 1 1e-06 1e-06 0.000001 0.000001 0.000001 0.088150 0.010961 0.000001 1e-06 1e-06
## 2 1e-06 1e-06 0.024866 0.040790 0.397809 0.000001 0.000001 0.050008 1e-06 1e-06
## 3 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 0.000001 0.000001 1e-06 1e-06
## 4 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 0.000001 0.000001 1e-06 1e-06
## 5 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 0.000001 0.000001 1e-06 1e-06
## 6 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 0.000001 0.000001 1e-06 1e-06
##        X21
## 1 0.000001
## 2 0.010530
## 3 0.000001
## 4 0.000001
## 5 0.000001
## 6 0.000001

Rename the columns

# Rename the columns starting from the third one
leak21 <- leak21 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(leak21)
##    ind pop    v1       v2    v3    v4       v5    v6    v7    v8    v9      v10
## 1 1001 OKI 1e-06 0.169803 1e-06 1e-06 0.731067 1e-06 1e-06 1e-06 1e-06 0.000001
## 2 1002 OKI 1e-06 0.000001 1e-06 1e-06 0.388810 1e-06 1e-06 1e-06 1e-06 0.087172
## 3 1003 OKI 1e-06 0.000001 1e-06 1e-06 0.999978 1e-06 1e-06 1e-06 1e-06 0.000001
## 4 1004 OKI 1e-06 0.000001 1e-06 1e-06 0.999978 1e-06 1e-06 1e-06 1e-06 0.000001
## 5 1005 OKI 1e-06 0.000001 1e-06 1e-06 0.999978 1e-06 1e-06 1e-06 1e-06 0.000001
## 6 1006 OKI 1e-06 0.000001 1e-06 1e-06 0.999978 1e-06 1e-06 1e-06 1e-06 0.000001
##     v11   v12      v13      v14      v15      v16      v17      v18   v19   v20
## 1 1e-06 1e-06 0.000001 0.000001 0.000001 0.088150 0.010961 0.000001 1e-06 1e-06
## 2 1e-06 1e-06 0.024866 0.040790 0.397809 0.000001 0.000001 0.050008 1e-06 1e-06
## 3 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 0.000001 0.000001 1e-06 1e-06
## 4 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 0.000001 0.000001 1e-06 1e-06
## 5 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 0.000001 0.000001 1e-06 1e-06
## 6 1e-06 1e-06 0.000001 0.000001 0.000001 0.000001 0.000001 0.000001 1e-06 1e-06
##        v21
## 1 0.000001
## 2 0.010530
## 3 0.000001
## 4 0.000001
## 5 0.000001
## 6 0.000001

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

2.7.2 Using ggplot2 for individual admixtures for K=21

source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- leak21 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 
color_palette_21 <-
  c(
    "chocolate4",
    "green4",
    "#1E90FF",
    "#B20CC9",
    "navy",     
    "#75FAFF",    
    "#008080",
    "#77DD77",
    "magenta",   
    "orangered2",       
    "purple",
    "yellow2",
    "goldenrod3",
    "green",
    "#FF8C1A",
    "#AE9393",
    "purple4",
    "#B22222",     
    "#FFFF99",
    "#F49AC2",  
    "blue"    
    )
# Generate all potential variable names
#Q_grouped_filtered <- Q_grouped_filtered[,1:3]
#all_variables <- paste0("v", 1:20)

all_variables <- c("v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16","v17","v18","v19","v20", "v21")

# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette_21[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=21.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
 scale_fill_manual(values = color_palette_21) +
#  expand_limits(y = c(0, 1.5))
#  scale_fill_manual(values = color_mapping$color) +  # Updated line
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=21_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)
color_palette_21 <-
  c(
    "coral", 
    "#B20CC9",     
    "green",
    "chocolate4",
    "magenta",  
    "#77DD77",
    "purple4",    
    "#F49AC2",  
    "#FF8C1A",
    "#008080",
    "navy",   
    "blue",
    "#AE9393",
    "green4",
    "purple",
    "#75FAFF",   
    "orangered2",  
    "yellow2",
    "#FFFF99",
    "#B22222",   
    "#1E90FF"
)

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=21.\n fastStructure inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
 scale_fill_manual(values = color_palette_21) +
#  expand_limits(y = c(0, 1.5))
#  scale_fill_manual(values = color_mapping$color) +  # Updated line
  expand_limits(y = c(0, 1.5))

ggsave(
  here("scripts", "RMarkdowns", "output", "euro_global", "fastStructure", "MAF_1", "fastStructure_k=21_euro_global_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3. Plots for admixture for SNP Set 3 (MAF 1%)

3.1 Admixture plot for K=21

3.1.1 Import the Q matrix (K21 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix21 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run2/r2_0.01_b.21.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix21)
## # A tibble: 6 × 21
##        X1      X2      X3      X4      X5      X6    X7      X8      X9     X10
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.00001 0.00888 0.00001 0.00001 0.0590  0.0546  0.620 0.0173  0.00001 0.0638 
## 2 0.0111  0.0897  0.00780 0.00739 0.00001 0.0820  0.564 0.0160  0.00001 0.0702 
## 3 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 1.00  0.00001 0.00001 0.00001
## 4 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.933 0.00001 0.00625 0.0104 
## 5 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.993 0.00001 0.00001 0.00001
## 6 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 1.00  0.00001 0.00001 0.00001
## # ℹ 11 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix21 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix21)

head(admix21)
##    ind pop       X1       X2       X3       X4       X5       X6       X7
## 1 1001 OKI 0.000010 0.008885 0.000010 0.000010 0.058973 0.054559 0.620211
## 2 1002 OKI 0.011118 0.089664 0.007797 0.007394 0.000010 0.081994 0.563790
## 3 1003 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.999800
## 4 1004 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.933116
## 5 1005 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.993267
## 6 1006 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.999800
##         X8       X9      X10      X11      X12     X13   X14      X15      X16
## 1 0.017322 0.000010 0.063773 0.025953 0.038726 0.00024 1e-05 0.032562 0.010159
## 2 0.016013 0.000010 0.070222 0.031050 0.058734 0.00694 1e-05 0.024707 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001 1e-05 0.000010 0.000010
## 4 0.000010 0.006248 0.010398 0.015394 0.000010 0.00001 1e-05 0.002084 0.000010
## 5 0.000010 0.000010 0.000010 0.001392 0.000010 0.00001 1e-05 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001 1e-05 0.000010 0.000010
##        X17      X18      X19      X20      X21
## 1 0.000010 0.037947 0.000010 0.003427 0.027193
## 2 0.000010 0.006611 0.009499 0.014407 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.020956 0.011665 0.000010 0.000010 0.000010
## 5 0.000467 0.000010 0.000010 0.004705 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010

Rename the columns

# Rename the columns starting from the third one
admix21 <- admix21 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix21)
##    ind pop       v1       v2       v3       v4       v5       v6       v7
## 1 1001 OKI 0.000010 0.008885 0.000010 0.000010 0.058973 0.054559 0.620211
## 2 1002 OKI 0.011118 0.089664 0.007797 0.007394 0.000010 0.081994 0.563790
## 3 1003 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.999800
## 4 1004 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.933116
## 5 1005 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.993267
## 6 1006 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.999800
##         v8       v9      v10      v11      v12     v13   v14      v15      v16
## 1 0.017322 0.000010 0.063773 0.025953 0.038726 0.00024 1e-05 0.032562 0.010159
## 2 0.016013 0.000010 0.070222 0.031050 0.058734 0.00694 1e-05 0.024707 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001 1e-05 0.000010 0.000010
## 4 0.000010 0.006248 0.010398 0.015394 0.000010 0.00001 1e-05 0.002084 0.000010
## 5 0.000010 0.000010 0.000010 0.001392 0.000010 0.00001 1e-05 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001 1e-05 0.000010 0.000010
##        v17      v18      v19      v20      v21
## 1 0.000010 0.037947 0.000010 0.003427 0.027193
## 2 0.000010 0.006611 0.009499 0.014407 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.020956 0.011665 0.000010 0.000010 0.000010
## 5 0.000467 0.000010 0.000010 0.004705 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_21 <-
  c(
    "#FF8C1A",
    "yellow2",
    "#77DD77",
    "chocolate4",
    "#B22222",
    "purple",
    "#B20CC9",
    "#F49AC2",
    "blue",
    "#1E90FF",
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "green",
    "#008080",
    "goldenrod3",
    "orangered2"
    )

3.1.2 Plot individual admixture for k=21

# Create a combined variable for Region and Country
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix21 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:21)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_21[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=21.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_21) +
  expand_limits(y = c(0, 1.5))

color_palette_21 <-
  c(
    "#77DD77",
    "yellow2",
    "navy", 
    "#FFFF99",
    "#AE9393",
    "orangered2", 
    "#FF8C1A",
    "#B20CC9",
    "chocolate4",
    "gray31",
    "#B22222",
    "magenta",    
    "purple4", 
    "coral", 
    "#1E90FF",
    "purple",
    "#008080",
    "green",
    "#75FAFF",
    "blue",
    "#F49AC2"
    )
    #"turquoise2",
    #"lemonchiffon2",
    #"hotpink"


ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=21.\n Admixture  for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_21) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/ admixture_euro_global_k21_r01_MAF1.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf)

3.2 Admixture plot for K=18

3.2.1 Import the Q matrix (K18 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix18 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run1/r2_0.01_b.18.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix18)
## # A tibble: 6 × 18
##      X1      X2      X3       X4      X5      X6      X7      X8      X9     X10
##   <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.634 0.00128 0.0183  0.00001  0.0413  0.00001 0.00001 0.0272  0.0458  1   e-5
## 2 0.574 0.00755 0.0184  0.00001  0.0179  0.0128  0.00001 0.0342  0.0558  1.03e-2
## 3 0.999 0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001 0.00001 1   e-5
## 4 0.932 0.00001 0.00001 0.0251   0.00001 0.00322 0.00828 0.0158  0.00804 1   e-5
## 5 0.988 0.00001 0.00001 0.000647 0.00001 0.00001 0.00001 0.00181 0.00001 1   e-5
## 6 1.00  0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001 0.00001 1.80e-5
## # ℹ 8 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix18 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix18)

head(admix18)
##    ind pop       X1       X2       X3       X4       X5       X6       X7
## 1 1001 OKI 0.633500 0.001284 0.018338 0.000010 0.041348 0.000010 0.000010
## 2 1002 OKI 0.574396 0.007550 0.018424 0.000010 0.017869 0.012802 0.000010
## 3 1003 OKI 0.999363 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1004 OKI 0.931808 0.000010 0.000010 0.025132 0.000010 0.003222 0.008278
## 5 1005 OKI 0.988364 0.000010 0.000010 0.000647 0.000010 0.000010 0.000010
## 6 1006 OKI 0.999822 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##         X8       X9      X10      X11      X12      X13      X14      X15
## 1 0.027194 0.045802 0.000010 0.083504 0.018486 0.036335 0.060839 0.009407
## 2 0.034247 0.055830 0.010255 0.021814 0.000010 0.027965 0.091045 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.000477 0.000010 0.000010
## 4 0.015805 0.008035 0.000010 0.000010 0.000010 0.002449 0.000010 0.001724
## 5 0.001810 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 0.000010 0.000010 0.000018 0.000010 0.000010 0.000010 0.000010 0.000010
##        X16      X17      X18
## 1 0.009535 0.004749 0.009639
## 2 0.090179 0.014823 0.022762
## 3 0.000010 0.000010 0.000010
## 4 0.000010 0.003457 0.000010
## 5 0.000010 0.009038 0.000010
## 6 0.000010 0.000010 0.000010

Rename the columns

# Rename the columns starting from the third one
admix18 <- admix18 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix18)
##    ind pop       v1       v2       v3       v4       v5       v6       v7
## 1 1001 OKI 0.633500 0.001284 0.018338 0.000010 0.041348 0.000010 0.000010
## 2 1002 OKI 0.574396 0.007550 0.018424 0.000010 0.017869 0.012802 0.000010
## 3 1003 OKI 0.999363 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1004 OKI 0.931808 0.000010 0.000010 0.025132 0.000010 0.003222 0.008278
## 5 1005 OKI 0.988364 0.000010 0.000010 0.000647 0.000010 0.000010 0.000010
## 6 1006 OKI 0.999822 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##         v8       v9      v10      v11      v12      v13      v14      v15
## 1 0.027194 0.045802 0.000010 0.083504 0.018486 0.036335 0.060839 0.009407
## 2 0.034247 0.055830 0.010255 0.021814 0.000010 0.027965 0.091045 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.000477 0.000010 0.000010
## 4 0.015805 0.008035 0.000010 0.000010 0.000010 0.002449 0.000010 0.001724
## 5 0.001810 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 0.000010 0.000010 0.000018 0.000010 0.000010 0.000010 0.000010 0.000010
##        v16      v17      v18
## 1 0.009535 0.004749 0.009639
## 2 0.090179 0.014823 0.022762
## 3 0.000010 0.000010 0.000010
## 4 0.000010 0.003457 0.000010
## 5 0.000010 0.009038 0.000010
## 6 0.000010 0.000010 0.000010

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_18 <-
  c(
    "#77DD77",
    "yellow2",
    "navy", 
    "#FFFF99",
    "#AE9393",
    "orangered2", 
    "#FF8C1A",
    "chocolate4",
    "#B22222",
    "magenta",    
    "purple4", 
    "#1E90FF",
    "purple",
    "#008080",
    "green",
    "#75FAFF",
    "blue",
    "#F49AC2"
    )

3.2.2 Plot individual admixture for K=18

# Create a combined variable for Region and Country
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix18 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:18)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_18[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_18) +
  expand_limits(y = c(0, 1.5))

color_palette_18 <-
  c(
    "#75FAFF",
    "#1E90FF",
    "#F49AC2",    
    "orangered2", 
    "green",
    "yellow2",    
    "#77DD77",
    "#B22222",
    "#008080",
    "purple4",
    "blue",
    "#AE9393",
    "magenta",    
    "#B20CC9",
    "chocolate4",
    "purple",
    "#FF8C1A",
    "navy"
    )

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n Admixture for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_18) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/admixture_euro_global_k18_r01_MAF1.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.3 Admixture plot for K=19

3.3.1 Import the Q matrix (K19 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix19 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run2/r2_0.01_b.19.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix19)
## # A tibble: 6 × 19
##      X1      X2      X3      X4       X5      X6      X7      X8      X9     X10
##   <dbl>   <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.519 0.0260  0.0103  0.00001 0.000321 0.0680  0.0462  0.0248  0.0889  0.00001
## 2 0.382 0.0257  0.0932  0.0100  0.0102   0.0945  0.0214  0.0321  0.0228  0.00001
## 3 1.00  0.00001 0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001 0.00001
## 4 1.00  0.00001 0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001 0.00001
## 5 1.00  0.00001 0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001 0.00001
## 6 1.00  0.00001 0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001 0.00001
## # ℹ 9 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix19 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix19)

head(admix19)
##    ind pop       X1       X2       X3       X4       X5       X6       X7
## 1 1001 OKI 0.518854 0.025995 0.010330 0.000010 0.000321 0.067973 0.046244
## 2 1002 OKI 0.382380 0.025672 0.093156 0.010032 0.010211 0.094469 0.021400
## 3 1003 OKI 0.999820 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1004 OKI 0.999820 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 5 1005 OKI 0.999820 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 1006 OKI 0.999820 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##         X8       X9   X10      X11      X12      X13      X14      X15      X16
## 1 0.024756 0.088854 1e-05 0.069505 0.027061 0.015078 0.000010 0.000010 0.004824
## 2 0.032072 0.022774 1e-05 0.074544 0.028855 0.000010 0.011621 0.008761 0.000010
## 3 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 5 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##        X17   X18      X19
## 1 0.096006 1e-05 0.004150
## 2 0.174595 1e-05 0.009418
## 3 0.000010 1e-05 0.000010
## 4 0.000010 1e-05 0.000010
## 5 0.000010 1e-05 0.000010
## 6 0.000010 1e-05 0.000010

Rename the columns

# Rename the columns starting from the third one
admix19 <- admix19 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix19)
##    ind pop       v1       v2       v3       v4       v5       v6       v7
## 1 1001 OKI 0.518854 0.025995 0.010330 0.000010 0.000321 0.067973 0.046244
## 2 1002 OKI 0.382380 0.025672 0.093156 0.010032 0.010211 0.094469 0.021400
## 3 1003 OKI 0.999820 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1004 OKI 0.999820 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 5 1005 OKI 0.999820 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 1006 OKI 0.999820 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##         v8       v9   v10      v11      v12      v13      v14      v15      v16
## 1 0.024756 0.088854 1e-05 0.069505 0.027061 0.015078 0.000010 0.000010 0.004824
## 2 0.032072 0.022774 1e-05 0.074544 0.028855 0.000010 0.011621 0.008761 0.000010
## 3 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 5 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##        v17   v18      v19
## 1 0.096006 1e-05 0.004150
## 2 0.174595 1e-05 0.009418
## 3 0.000010 1e-05 0.000010
## 4 0.000010 1e-05 0.000010
## 5 0.000010 1e-05 0.000010
## 6 0.000010 1e-05 0.000010

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_19 <-
  c(
    "#77DD77",
    "yellow2",
    "navy", 
    "#FFFF99",
    "#AE9393",
    "orangered2", 
    "#FF8C1A",
    "chocolate4",
    "#B22222",
    "magenta",    
    "purple4", 
    "#1E90FF",
    "purple",
    "#008080",
    "green",
    "#75FAFF",
    "blue",
    "#F49AC2",
    "#B20CC9"
    )

3.3.2 Plot individual admixture for k=19

# Create a combined variable for Region and Country
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix19 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:19)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_19[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=19.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_19) +
  expand_limits(y = c(0, 1.5))

color_palette_19 <-
  c(
    "chocolate4",
    "purple",
    "#AE9393",
    "orangered2", 
    "#77DD77",
    "#FFFF99",
    "#FF8C1A",
    "#008080",
    "green",
    "navy", 
    "green4",
    "blue",
    "magenta",  
    "#B22222",
    "#F49AC2",
    "#75FAFF",
    "#1E90FF",
    "yellow2",
    "purple4"
  )

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=19.\n Admixture  for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_19) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/admixture_euro_global_k19_r01_MAF1.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.4 Admixture plot for K=20

3.4.1 Import the Q matrix (K20 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix20 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run3/r2_0.01_b.20.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix20)
## # A tibble: 6 × 20
##        X1      X2      X3      X4      X5      X6      X7      X8     X9     X10
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>
## 1 0.0254  0.0111  0.0104  0.0168  0.00710 0.00334 0.00001 0.0443    1e-5 1.10e-5
## 2 0.0315  0.0918  0.00001 0.0222  0.00408 0.00889 0.00884 0.0224    1e-5 2.79e-3
## 3 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001   1e-5 1   e-5
## 4 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001   1e-5 1   e-5
## 5 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001   1e-5 1   e-5
## 6 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001   1e-5 1   e-5
## # ℹ 10 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>

The fam file

fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix20 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix20)

head(admix20)
##    ind pop       X1       X2       X3       X4       X5       X6      X7
## 1 1001 OKI 0.025351 0.011117 0.010441 0.016805 0.007102 0.003340 0.00001
## 2 1002 OKI 0.031522 0.091752 0.000010 0.022249 0.004084 0.008889 0.00884
## 3 1003 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001
## 4 1004 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001
## 5 1005 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001
## 6 1006 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001
##         X8    X9      X10      X11      X12   X13      X14   X15      X16
## 1 0.044306 1e-05 0.000011 0.092170 0.000010 1e-05 0.057660 1e-05 0.035908
## 2 0.022441 1e-05 0.002791 0.172939 0.011705 1e-05 0.098137 1e-05 0.030885
## 3 0.000010 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 4 0.000010 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 5 0.000010 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 6 0.000010 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
##        X17      X18      X19      X20
## 1 0.057432 0.043066 0.082289 0.512953
## 2 0.071283 0.014281 0.027687 0.380475
## 3 0.000010 0.000010 0.000010 0.999810
## 4 0.000010 0.000010 0.000010 0.999810
## 5 0.000010 0.000010 0.000010 0.999810
## 6 0.000010 0.000010 0.000010 0.999810

Rename the columns

# Rename the columns starting from the third one
admix20 <- admix20 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix20)
##    ind pop       v1       v2       v3       v4       v5       v6      v7
## 1 1001 OKI 0.025351 0.011117 0.010441 0.016805 0.007102 0.003340 0.00001
## 2 1002 OKI 0.031522 0.091752 0.000010 0.022249 0.004084 0.008889 0.00884
## 3 1003 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001
## 4 1004 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001
## 5 1005 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001
## 6 1006 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.00001
##         v8    v9      v10      v11      v12   v13      v14   v15      v16
## 1 0.044306 1e-05 0.000011 0.092170 0.000010 1e-05 0.057660 1e-05 0.035908
## 2 0.022441 1e-05 0.002791 0.172939 0.011705 1e-05 0.098137 1e-05 0.030885
## 3 0.000010 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 4 0.000010 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 5 0.000010 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
## 6 0.000010 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 1e-05 0.000010
##        v17      v18      v19      v20
## 1 0.057432 0.043066 0.082289 0.512953
## 2 0.071283 0.014281 0.027687 0.380475
## 3 0.000010 0.000010 0.000010 0.999810
## 4 0.000010 0.000010 0.000010 0.999810
## 5 0.000010 0.000010 0.000010 0.999810
## 6 0.000010 0.000010 0.000010 0.999810

Import Sample Locations

sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_20 <-
  c(
    "#FF8C1A",
    "yellow2",
    "#77DD77",
    "chocolate4",
    "#B22222",
    "purple",
    "#B20CC9",
    "#F49AC2",
    "blue",
    "#1E90FF",
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "navy", 
    "green",
    "#008080",
    "goldenrod3",
    "orangered2"
    )

3.4.2 Plot individual admixture for K=20

# Create a combined variable for Region and Country
source(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix20 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:20)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_20[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=20.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_20) +
  expand_limits(y = c(0, 1.5))

color_palette_20 <-
  c(
    "coral",
    "navy", 
    "#AE9393",
    "#1E90FF",
    "#75FAFF",
    "magenta",
    "#B20CC9",
    "green",
    "blue",
    "#77DD77",
    "#B22222",
    "purple",
    "#FF8C1A",
    "yellow2",
    "#008080",
    "purple4",
    "orangered2",
    "#FFFF99",
    "green4",
    "#F49AC2"
    )

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=20.\n Admixture  for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_20) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/admixture_euro_global_k20_r01_MAF1.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.5 Admixture plot for K=17

3.5.1 Import the Q matrix (K17 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix17 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run5/r2_0.01_b.17.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix17)
## # A tibble: 6 × 17
##        X1      X2      X3      X4      X5      X6      X7      X8      X9
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.0700  0.00001 0.00001 0.00001 0.0629  0.0392  0.00001 0.00116 0.00647
## 2 0.0197  0.0100  0.00977 0.00916 0.0908  0.0177  0.00001 0.00835 0.00001
## 3 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00812 0.00001 0.00200
## 5 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## 6 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001
## # ℹ 8 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## #   X15 <dbl>, X16 <dbl>, X17 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix17 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix17)

head(admix17)
##    ind pop       X1       X2       X3       X4       X5       X6       X7
## 1 1001 OKI 0.070045 0.000010 0.000010 0.000010 0.062902 0.039175 0.000010
## 2 1002 OKI 0.019664 0.010023 0.009771 0.009156 0.090758 0.017670 0.000010
## 3 1003 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1004 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.008117
## 5 1005 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 1006 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##         X8       X9      X10      X11      X12      X13      X14      X15
## 1 0.001156 0.006473 0.027931 0.077951 0.631822 0.015124 0.016972 0.035849
## 2 0.008349 0.000010 0.033731 0.078257 0.573829 0.000010 0.020668 0.035796
## 3 0.000010 0.000010 0.000010 0.000010 0.999840 0.000010 0.000010 0.000010
## 4 0.000010 0.002005 0.015024 0.014176 0.931584 0.000010 0.000010 0.003291
## 5 0.000010 0.000010 0.002707 0.000010 0.992180 0.000010 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.999840 0.000010 0.000010 0.000010
##       X16      X17
## 1 0.01455 0.000010
## 2 0.09229 0.000010
## 3 0.00001 0.000010
## 4 0.00001 0.025703
## 5 0.00001 0.004972
## 6 0.00001 0.000010

Rename the columns

# Rename the columns starting from the third one
admix17 <- admix17 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix17)
##    ind pop       v1       v2       v3       v4       v5       v6       v7
## 1 1001 OKI 0.070045 0.000010 0.000010 0.000010 0.062902 0.039175 0.000010
## 2 1002 OKI 0.019664 0.010023 0.009771 0.009156 0.090758 0.017670 0.000010
## 3 1003 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1004 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.008117
## 5 1005 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 1006 OKI 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##         v8       v9      v10      v11      v12      v13      v14      v15
## 1 0.001156 0.006473 0.027931 0.077951 0.631822 0.015124 0.016972 0.035849
## 2 0.008349 0.000010 0.033731 0.078257 0.573829 0.000010 0.020668 0.035796
## 3 0.000010 0.000010 0.000010 0.000010 0.999840 0.000010 0.000010 0.000010
## 4 0.000010 0.002005 0.015024 0.014176 0.931584 0.000010 0.000010 0.003291
## 5 0.000010 0.000010 0.002707 0.000010 0.992180 0.000010 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.999840 0.000010 0.000010 0.000010
##       v16      v17
## 1 0.01455 0.000010
## 2 0.09229 0.000010
## 3 0.00001 0.000010
## 4 0.00001 0.025703
## 5 0.00001 0.004972
## 6 0.00001 0.000010

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_17 <-
  c(
    "#77DD77",
    "yellow2",
    "navy", 
    "#AE9393",
    "orangered2", 
    "#FF8C1A",
    "chocolate4",
    "#B22222",
    "magenta",    
    "purple4", 
    "#1E90FF",
    "purple",
    "#008080",
    "green",
    "#75FAFF",
    "blue",
    "#F49AC2"
    )
#    "#FFFF99",

3.5.2 Plot individual admixture for K=17

# Create a combined variable for Region and Country
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix17 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:17)


# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_17[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=17.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_17) +
  expand_limits(y = c(0, 1.5))

color_palette_17 <-
  c(
    "#1E90FF",
    "yellow2",
    "purple4", 
    "#FF8C1A",
    "#FFFF99",    
    "navy", 
    "#008080",
    "green",
    "#F49AC2",
    "#B22222",    
    "purple",
    "#AE9393",    
    "#77DD77",
    "#75FAFF",
    "orangered2", 
    "blue",
    "magenta"  
    )

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=17.\n Admixture  for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_17) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/admixture_euro_global_k17_r01_MAF1.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.6 Admixture plot for K=22

3.6.1 Import the Q matrix (K=22 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix22 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run2/r2_0.01_b.22.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix22)
## # A tibble: 6 × 22
##      X1      X2      X3      X4       X5      X6      X7      X8      X9     X10
##   <dbl>   <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 0.618 0.0180  0.0457  0.0477  0.000012 0.00964 0.0665  0.00217 0.00001 0.00366
## 2 0.565 0.0161  0.0524  0.00001 0.00261  0.0856  0.0228  0.0102  0.00001 0.00172
## 3 1.00  0.00001 0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001 0.00001
## 4 0.900 0.00001 0.0134  0.0184  0.00001  0.00001 0.00001 0.00001 0.00565 0.00001
## 5 0.958 0.00001 0.00386 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001 0.00001
## 6 1.00  0.00001 0.00001 0.00001 0.00001  0.00001 0.00001 0.00001 0.00001 0.00001
## # ℹ 12 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>, X22 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix22 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix22)

head(admix22)
##    ind pop       X1       X2       X3       X4       X5       X6       X7
## 1 1001 OKI 0.617735 0.017975 0.045659 0.047731 0.000012 0.009635 0.066494
## 2 1002 OKI 0.565166 0.016065 0.052427 0.000010 0.002608 0.085642 0.022794
## 3 1003 OKI 0.999790 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1004 OKI 0.900365 0.000010 0.013402 0.018387 0.000010 0.000010 0.000010
## 5 1005 OKI 0.958407 0.000010 0.003861 0.000010 0.000010 0.000010 0.000010
## 6 1006 OKI 0.999790 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##         X8       X9      X10      X11      X12      X13      X14      X15
## 1 0.002170 0.000010 0.003662 0.038685 0.048688 0.001629 0.026636 0.000010
## 2 0.010225 0.000010 0.001715 0.014987 0.085294 0.013219 0.033936 0.012333
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.005647 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 5 0.000010 0.000010 0.000010 0.000010 0.000010 0.004520 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##        X16      X17      X18      X19      X20      X21      X22
## 1 0.000154 0.033072 0.000010 0.027588 0.000723 0.000010 0.011713
## 2 0.005227 0.026089 0.004688 0.031818 0.000010 0.015727 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.000010 0.000010 0.008953 0.053087 0.000010 0.000010
## 5 0.000010 0.000010 0.000010 0.000010 0.033031 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010

Rename the columns

# Rename the columns starting from the third one
admix22 <- admix22 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix22)
##    ind pop       v1       v2       v3       v4       v5       v6       v7
## 1 1001 OKI 0.617735 0.017975 0.045659 0.047731 0.000012 0.009635 0.066494
## 2 1002 OKI 0.565166 0.016065 0.052427 0.000010 0.002608 0.085642 0.022794
## 3 1003 OKI 0.999790 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 1004 OKI 0.900365 0.000010 0.013402 0.018387 0.000010 0.000010 0.000010
## 5 1005 OKI 0.958407 0.000010 0.003861 0.000010 0.000010 0.000010 0.000010
## 6 1006 OKI 0.999790 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##         v8       v9      v10      v11      v12      v13      v14      v15
## 1 0.002170 0.000010 0.003662 0.038685 0.048688 0.001629 0.026636 0.000010
## 2 0.010225 0.000010 0.001715 0.014987 0.085294 0.013219 0.033936 0.012333
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.005647 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 5 0.000010 0.000010 0.000010 0.000010 0.000010 0.004520 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
##        v16      v17      v18      v19      v20      v21      v22
## 1 0.000154 0.033072 0.000010 0.027588 0.000723 0.000010 0.011713
## 2 0.005227 0.026089 0.004688 0.031818 0.000010 0.015727 0.000010
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.000010 0.000010 0.008953 0.053087 0.000010 0.000010
## 5 0.000010 0.000010 0.000010 0.000010 0.033031 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_22 <-
  c(
    "#FF8C1A",
    "yellow2",
    "#77DD77",
    "#1E90FF",
    "#B22222",
    "chocolate4",
    "#B20CC9",
    "#F49AC2",
    "blue",
    "#008080", 
    "purple4",
    "#FFFF99",
    "#75FAFF",
    "#AE9393",
    "magenta",
    "green4",
    "navy", 
    "green",
    "purple",
    "orangered2",
    "goldenrod3",
    "coral")

3.6.2 Plot individual admixture for K=22

# Create a combined variable for Region and Country
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix22 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:22)

color_palette_22 <-c(
    "green4",
    "coral", 
    "purple4",   
    "goldenrod3",
    "yellow2",
    "#FF8C1A",
    "magenta",
    "#FFFF99",    
    "orangered2",
    "#75FAFF",
    "#B22222",
    "#F49AC2",
    "#1E90FF",
    "#77DD77",    
    "#AE9393",
    "navy", 
    "chocolate4",
    "#B20CC9",
    "#008080",     
    "blue",   
    "green",
    "purple"    
) 
  
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_22[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=22.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_22) +
  expand_limits(y = c(0, 1.5))

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=22.\n Admixture  for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_22) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/admixture_euro_global_k22_r01_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.7 Admixture plot for K=6

3.7.1 Import the Q matrix (K=6 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix6 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run5/r2_0.01_b.6.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix6)
## # A tibble: 6 × 6
##      X1    X2      X3    X4    X5     X6
##   <dbl> <dbl>   <dbl> <dbl> <dbl>  <dbl>
## 1 0.198 0.175 0.00001 0.466 0.122 0.0380
## 2 0.218 0.163 0.0109  0.434 0.124 0.0498
## 3 0.204 0.134 0.00527 0.482 0.132 0.0430
## 4 0.204 0.200 0.00001 0.422 0.109 0.0649
## 5 0.188 0.176 0.00001 0.445 0.128 0.0621
## 6 0.212 0.168 0.00001 0.464 0.110 0.0451

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Merge columns "FamilyID" and "IndividualID" with an underscore
# fam_data$ind <- paste(fam_data$FamilyID, fam_data$IndividualID, sep = "_")

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix6 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix6)

head(admix6)
##    ind pop       X1       X2       X3       X4       X5       X6
## 1 1001 OKI 0.198226 0.174974 0.000010 0.466489 0.122310 0.037991
## 2 1002 OKI 0.218327 0.163059 0.010860 0.433607 0.124358 0.049790
## 3 1003 OKI 0.203547 0.133653 0.005273 0.482053 0.132470 0.043004
## 4 1004 OKI 0.203962 0.199981 0.000010 0.421775 0.109376 0.064895
## 5 1005 OKI 0.188237 0.176176 0.000010 0.445296 0.128135 0.062147
## 6 1006 OKI 0.211846 0.168153 0.000010 0.464497 0.110375 0.045119

Rename the columns

# Rename the columns starting from the third one
admix6 <- admix6 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix6)
##    ind pop       v1       v2       v3       v4       v5       v6
## 1 1001 OKI 0.198226 0.174974 0.000010 0.466489 0.122310 0.037991
## 2 1002 OKI 0.218327 0.163059 0.010860 0.433607 0.124358 0.049790
## 3 1003 OKI 0.203547 0.133653 0.005273 0.482053 0.132470 0.043004
## 4 1004 OKI 0.203962 0.199981 0.000010 0.421775 0.109376 0.064895
## 5 1005 OKI 0.188237 0.176176 0.000010 0.445296 0.128135 0.062147
## 6 1006 OKI 0.211846 0.168153 0.000010 0.464497 0.110375 0.045119

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_6 <-
    c(
    "#77DD37",  
    "red",
    "purple3",
    "#FFFF19",
    "#1E90FF",
    "#FF8C1A"
        )

3.7.2 Plot individual admixture for K=6

# Create a combined variable for Region and Country
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix6 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:6)

color_palette_6 <-
c(
    "purple3",
    "#1E90FF",
    "#FFFF19",
    "#77DD37", 
    "red",
    "#FF8C1A"
)
 
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                           color = color_palette_6[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_6) +
  expand_limits(y = c(0, 1.5))

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n Admixture  for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_6) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/admixture_euro_global_k6_r01_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.8 Admixture plot for K=5

3.8.1 Import the Q matrix (K=5 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix5 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run3/r2_0.01_b.5.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix5)
## # A tibble: 6 × 5
##      X1    X2      X3    X4    X5
##   <dbl> <dbl>   <dbl> <dbl> <dbl>
## 1 0.200 0.452 0.00001 0.224 0.124
## 2 0.219 0.434 0.0118  0.209 0.126
## 3 0.203 0.472 0.00686 0.184 0.134
## 4 0.205 0.440 0.00001 0.244 0.111
## 5 0.189 0.459 0.00001 0.222 0.129
## 6 0.213 0.458 0.00001 0.216 0.112

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix5)

head(admix5)
##    ind pop       X1       X2       X3       X4       X5
## 1 1001 OKI 0.199841 0.451968 0.000010 0.224482 0.123700
## 2 1002 OKI 0.219298 0.434467 0.011839 0.208849 0.125548
## 3 1003 OKI 0.203232 0.471849 0.006857 0.183926 0.134136
## 4 1004 OKI 0.205200 0.440208 0.000010 0.243953 0.110628
## 5 1005 OKI 0.189489 0.458913 0.000010 0.222154 0.129433
## 6 1006 OKI 0.213428 0.458036 0.000010 0.216318 0.112208

Rename the columns

# Rename the columns starting from the third one
admix5 <- admix5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix5)
##    ind pop       v1       v2       v3       v4       v5
## 1 1001 OKI 0.199841 0.451968 0.000010 0.224482 0.123700
## 2 1002 OKI 0.219298 0.434467 0.011839 0.208849 0.125548
## 3 1003 OKI 0.203232 0.471849 0.006857 0.183926 0.134136
## 4 1004 OKI 0.205200 0.440208 0.000010 0.243953 0.110628
## 5 1005 OKI 0.189489 0.458913 0.000010 0.222154 0.129433
## 6 1006 OKI 0.213428 0.458036 0.000010 0.216318 0.112208

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

3.8.2 Plot individual admixture for K=5

# Create a combined variable for Region and Country
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:5)

color_palette_5 <-
  c(
    "red",
    "#1E90FF",
    "#FF8C1A",    
    "purple3",
    "#FFFF19"
     )

# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_5[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_5) +
  expand_limits(y = c(0, 1.5))

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n Admixture for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_5) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/admixture_euro_global_k5_r01_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.9 Admixture plot for K=7

3.9.1 Import the Q matrix (K=7 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix7 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run5/r2_0.01_b.7.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix7)
## # A tibble: 6 × 7
##       X1       X2     X3    X4      X5    X6      X7
##    <dbl>    <dbl>  <dbl> <dbl>   <dbl> <dbl>   <dbl>
## 1 0.109  0.000686 0.127  0.194 0.00508 0.511 0.0537 
## 2 0.113  0.0133   0.119  0.214 0.0158  0.490 0.0355 
## 3 0.113  0.00752  0.0765 0.195 0.00419 0.598 0.00500
## 4 0.0916 0.00001  0.145  0.197 0.0259  0.541 0.00001
## 5 0.110  0.00001  0.119  0.182 0.0219  0.567 0.00001
## 6 0.0933 0.00001  0.117  0.206 0.0104  0.573 0.00001

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix7 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix7)

head(admix7)
##    ind pop       X1       X2       X3       X4       X5       X6       X7
## 1 1001 OKI 0.108630 0.000686 0.127234 0.194023 0.005080 0.510635 0.053712
## 2 1002 OKI 0.112784 0.013302 0.119158 0.213851 0.015833 0.489583 0.035489
## 3 1003 OKI 0.113346 0.007519 0.076485 0.195454 0.004194 0.598000 0.005002
## 4 1004 OKI 0.091561 0.000010 0.144644 0.197088 0.025910 0.540777 0.000010
## 5 1005 OKI 0.109764 0.000010 0.118556 0.182388 0.021868 0.567404 0.000010
## 6 1006 OKI 0.093288 0.000010 0.116956 0.206377 0.010426 0.572933 0.000010

Rename the columns

# Rename the columns starting from the third one
admix7 <- admix7 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix7)
##    ind pop       v1       v2       v3       v4       v5       v6       v7
## 1 1001 OKI 0.108630 0.000686 0.127234 0.194023 0.005080 0.510635 0.053712
## 2 1002 OKI 0.112784 0.013302 0.119158 0.213851 0.015833 0.489583 0.035489
## 3 1003 OKI 0.113346 0.007519 0.076485 0.195454 0.004194 0.598000 0.005002
## 4 1004 OKI 0.091561 0.000010 0.144644 0.197088 0.025910 0.540777 0.000010
## 5 1005 OKI 0.109764 0.000010 0.118556 0.182388 0.021868 0.567404 0.000010
## 6 1006 OKI 0.093288 0.000010 0.116956 0.206377 0.010426 0.572933 0.000010

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80

3.9.2 Plot individual admixture for K=7

# Create a combined variable for Region and Country
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix7 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:7)

color_palette_7 <-
  c(
    "red",
    "#77DD37",  
    "#FF8C1A",    
    "#FFFF19",
    "#75FAFF", 
    "#1E90FF",   
    "purple3")

# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_7[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_7) +
  expand_limits(y = c(0, 1.5))

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=7.\n Admixture for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_7) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/admixture_euro_global_k7_r01_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)

3.10 Admixture plot for K=23

3.10.1 Import the Q matrix (K=23 for admixture)

Select a Q matrix from one of the runs for the best k

# Extract ancestry coefficients
admix23 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/admixture/MAF_1/run4/r2_0.01_b.23.Q"),
  delim = " ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 

head(admix23)
## # A tibble: 6 × 23
##        X1      X2      X3      X4      X5      X6      X7      X8     X9     X10
##     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>  <dbl>   <dbl>
## 1 0.00001 0.00614 0.0555  0.00001 0.00001 0.0213  0.00344 0.0575    1e-5 0.00981
## 2 0.00001 0.00339 0.0646  0.00256 0.00001 0.0298  0.0111  0.0974    1e-5 0.00001
## 3 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001   1e-5 0.00001
## 4 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001   1e-5 0.00001
## 5 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001   1e-5 0.00001
## 6 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001 0.00001   1e-5 0.00001
## # ℹ 13 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## #   X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>,
## #   X22 <dbl>, X23 <dbl>

The fam file

fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      OKI         1001          0          0   2        -9
## 2      OKI         1002          0          0   2        -9
## 3      OKI         1003          0          0   2        -9
## 4      OKI         1004          0          0   2        -9
## 5      OKI         1005          0          0   2        -9
## 6      OKI         1006          0          0   1        -9

Create ID column

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI

Add it to the matrix

admix23 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(admix23)

head(admix23)
##    ind pop    X1       X2       X3       X4    X5       X6       X7       X8
## 1 1001 OKI 1e-05 0.006135 0.055534 0.000010 1e-05 0.021253 0.003443 0.057489
## 2 1002 OKI 1e-05 0.003390 0.064625 0.002563 1e-05 0.029792 0.011054 0.097427
## 3 1003 OKI 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 4 1004 OKI 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 5 1005 OKI 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 6 1006 OKI 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
##      X9      X10      X11      X12      X13      X14      X15   X16      X17
## 1 1e-05 0.009813 0.000010 0.005947 0.040916 0.089753 0.513623 1e-05 0.099794
## 2 1e-05 0.000010 0.011137 0.088533 0.017282 0.171202 0.377893 1e-05 0.035208
## 3 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.999780 1e-05 0.000010
## 4 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.999780 1e-05 0.000010
## 5 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.999780 1e-05 0.000010
## 6 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.999780 1e-05 0.000010
##        X18      X19      X20      X21      X22      X23
## 1 0.024905 0.000010 0.043611 0.000010 0.025067 0.002636
## 2 0.022834 0.008896 0.016482 0.011952 0.021553 0.008125
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 5 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010

Rename the columns

# Rename the columns starting from the third one
admix23 <- admix23 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

# View the first few rows
head(admix23)
##    ind pop    v1       v2       v3       v4    v5       v6       v7       v8
## 1 1001 OKI 1e-05 0.006135 0.055534 0.000010 1e-05 0.021253 0.003443 0.057489
## 2 1002 OKI 1e-05 0.003390 0.064625 0.002563 1e-05 0.029792 0.011054 0.097427
## 3 1003 OKI 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 4 1004 OKI 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 5 1005 OKI 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
## 6 1006 OKI 1e-05 0.000010 0.000010 0.000010 1e-05 0.000010 0.000010 0.000010
##      v9      v10      v11      v12      v13      v14      v15   v16      v17
## 1 1e-05 0.009813 0.000010 0.005947 0.040916 0.089753 0.513623 1e-05 0.099794
## 2 1e-05 0.000010 0.011137 0.088533 0.017282 0.171202 0.377893 1e-05 0.035208
## 3 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.999780 1e-05 0.000010
## 4 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.999780 1e-05 0.000010
## 5 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.999780 1e-05 0.000010
## 6 1e-05 0.000010 0.000010 0.000010 0.000010 0.000010 0.999780 1e-05 0.000010
##        v18      v19      v20      v21      v22      v23
## 1 0.024905 0.000010 0.043611 0.000010 0.025067 0.002636
## 2 0.022834 0.008896 0.016482 0.011952 0.021553 0.008125
## 3 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 4 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 5 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010
## 6 0.000010 0.000010 0.000010 0.000010 0.000010 0.000010

Import Sample Locations

sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_23 <-
  c(
    "purple4",
    "purple", 
    "orangered",
    "#FF8C1A",
    "#F49AC2",
    "magenta",
    "#AE9393",
    "#FFFF99",
    "orchid1",
    "#1E90FF",
    "chocolate4",
    "#B20CC9",
    "goldenrod",
    "#77DD77",
    "blue",
    "navy", 
    "green",
    "green4", 
    "#B22222",
    "#008080", 
    "coral",
    "yellow2",
    "#75FAFF"
      )

3.10.2 Plot individual admixture for K=23

# Create a combined variable for Region and Country
source(
  here("scripts", "RMarkdowns",
    "my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- admix23 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:23)

  
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_23[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=23.\n LEA inference for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_23) +
  expand_limits(y = c(0, 1.5))

color_palette_23 <-
  c(
    "coral",
    "#1E90FF",    
    "#FFFF99",    
    "green4", 
    "purple4",    
    "#F49AC2",
    "magenta",
    "yellow2",
    "navy",     
    "#75FAFF",
    "orangered",
    "chocolate4",
    "#77DD77",
    "green",
    "#FF8C1A",
    "purple", 
    "blue",    
    "#B22222",
    "goldenrod",
    "#008080", 
    "orchid1",
    "#AE9393",    
    "#B20CC9"   
  )
  

ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=23.\n Admixture  for 22,642 (MAF 1%) SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_23) +
  expand_limits(y = c(0, 1.5))

ggsave(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/euro_global/admixture/MAF_1/admixture_euro_global_k23_r01_MAF1.pdf"),
  width  = 12,
  height = 6,
  units  = "in",
  device = cairo_pdf
)