Load libraries.

# library(devtools)         # if you need to install some package from github or something else
library(tidyverse)          # many helpful things
library(here)               # to load data easily
#library(colorout)           # colors are cool
library(dplyr)              # to manipulate data
library(ggplot2)            # plots
# library(RColorBrewer)     # color palettes
# library(grid)             # set up plots
# library(scales)           # helps with plots axis scales
# library(showtext)         # helps annotating plots
# library(reticulate)       # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont)          # probably won't work on openOnDemand on the clusters
library(hrbrthemes)         # not really needed, personal preference for plots
#library(hrbrmisc)           # not really needed, personal preference for plots
# library(stringr)          # for strings operations
#library(ggstatsplot)        # statistics and plotting
library(flextable)          # create tables
library(officer)            # export office format

SNP Set 3

Run fastStructure for SNP Set 3 (MAF filtering at 1%) for European populations

1. Simple fastStructure (MAF 1%)

1.1. SNP Set 3: Run the Simple prior in fastStructure for the file with ~21k SNPs (from r2_0.01_b); this SNP set has LD filtering at r2<0.01 and MAF 1%

cd /gpfs/gibbs/pi/caccone/mkc54/albo/data/europe

1.1.1. Check populations in the dataset

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam | sort | uniq -c | awk '{print $2}' 
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## IMP
## ITB
## ITP
## ITR
## KER
## KRA
## MAL
## POL
## POP
## RAR
## ROM
## ROS
## SER
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPM
## SPS
## STS
## TIK
## TIR
## TRE
## TUA
## TUH

Count the total number of populations

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam | sort | uniq -c | wc -l
## 41

1.1.2 Create a directory for each run.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1
for i in $(seq -w 1 100)
do
  mkdir run$i
done
ls #should have 100 folders now

1.1.3 Create the dsq files for SNP Set 3

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1
module load dSQ/1.05 

for k in $(seq 1 25); do
    for run in $(seq -w 1 100); do
        echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1; source /vast/palmer/apps/software/miniconda/23.3.1/etc/profile.d/conda.sh; module load miniconda/4.12.0; conda activate fastStructure; export PYTHONPATH=/home/mkc54/fastStructure/; python -m structure -K $k --input=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b --output=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run$run/simple --full --cv=10 --tol=10e-6"
    done
done > dsq_europe_MAF1.txt

head dsq_europe_MAF1.txt #check file

dsq --job-file dsq_europe_MAF1.txt --output /dev/null --mail-type ALL -t 24:00:00 --partition=ycga --cpus-per-task=10 --job-name fS.simple_europe_MAF1 --batch-file simple_europe_MAF1.sh

#Check the files to see if they have 2500 jobs:
head simple_europe_MAF1.sh

1.1.4. Run the jobs for MAF 1%

Run a random job from our array to see if it works

sbatch --array=$((1 + $RANDOM % 500)) simple_europe_MAF1.sh

# check status
dsqa -j 18205004 #insert job number

#if it looks like it's running OK, you can cancel it
scancel 18205004 #insert job #

Submit all jobs.

sbatch simple_europe_MAF1.sh
#Submitted batch job # 18205283

#Check it
dsqa -j 18205283 #add job number here

1.1.5. Run the autopsy once it's done, for MAF 1%

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1
module load dSQ/1.05

dsqa -j 18205283 -f dsq_europe_MAF1.txt -s TIMEOUT > re-run_jobs1.txt 2> 18205283_report1.txt; head 18205283_report1.txt
 #all completed

1.1.6. Find the optimal number of K for MAF 1%.

You will see that the optimal K varies from run to run. You can estimate the mean, mode, and median across runs, then choose a value that makes sense.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1
srun -p ycga --pty -N 1 -n 1 -c 4 bash #start interactive session

module load miniconda
conda activate fastStructure 

#First check files
ls run*/* | wc -l # should be 12500

# choose k for manuscript data
export PYTHONPATH=/home/mkc54/fastStructure
python -m chooseK #see how we can use chooseK
#python /vast/palmer/home.mccleary/mkc54/fastStructure/chooseK.py
   #      --input=<file>

for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run*
do
    echo $i | sed 's@.*/@@'; python -m chooseK --input=$i/simple | grep 'likelihood\|structure'
done
#Model complexity that maximizes marginal likelihood = 
#Model components used to explain structure in data = 

Get outputs in a file

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1

cat <(echo 'run likelihood structure') \
<(for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run*
do
    echo $i | sed 's@.*/@@' | sed 's/run/run /' ; python -m chooseK --input=$i/simple | grep 'likelihood' | awk '{print $6, $8}'; python -m chooseK --input=$i/simple | grep 'structure' |  awk '{print $6, $10}'
done | xargs -n6 | awk '{print $2, $4, $6}') > simple_europe_MAF_1_25k.txt

head simple_europe_MAF_1_25k.txt #check the file


cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1
cp -r simple_europe_MAF_1_25k.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe #copy file 

1.1.7. Make a plot with the Ks for MAF 1%.

Let’s import the data in R.

# library(devtools)         # if you need to install some package from github or something else
library(tidyverse)          # many helpful things
library(here)               # to load data easily
#library(colorout)           # colors are cool
library(dplyr)              # to manipulate data
library(ggplot2)            # plots
# library(RColorBrewer)     # color palettes
# library(grid)             # set up plots
# library(scales)           # helps with plots axis scales
# library(showtext)         # helps annotating plots
# library(reticulate)       # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont)          # probably won't work on openOnDemand on the clusters
library(hrbrthemes)         # not really needed, personal preference for plots
#library(hrbrmisc)           # not really needed, personal preference for plots
# library(stringr)          # for strings operations
library(flextable)          # create tables
library(officer)            # export office format
library(here)
# function to import our choosek.py data
import_fastStructure <- function(file) {
  # import as a tibble and set columns as integers
  dat <- read_delim(
    file,
    col_names      = TRUE,
    show_col_types = FALSE,
    col_types      = "iii"
  )

  # get columns we need and make it long for plotting
  dat <- dat |>
    gather(
      structure, likelihood, -run
    )

  # rename the columns by index
  dat <- dat |>
    rename(
      run   = 1,
      model = 2,
      k     = 3
    )
  return(dat)
}

Run the function to import the MAF 1% results into R

choose_k_simple <- 
  import_fastStructure(
  here("output", "europe", "simple_europe_MAF_1_25k.txt"
  )
)
meank <- aggregate(choose_k_simple[, 3], list(choose_k_simple$model), mean)
meank
##      Group.1     k
## 1 likelihood 18.04
## 2  structure  3.68
mediank <-aggregate(choose_k_simple[, 3], list(choose_k_simple$model), median)
mediank
##      Group.1  k
## 1 likelihood 18
## 2  structure  1
mink <-aggregate(choose_k_simple[, 3], list(choose_k_simple$model), min)

maxk <-aggregate(choose_k_simple[, 3], list(choose_k_simple$model), max)

Mean K across the 100 runs: likelihood = 18.04; structure = 3.68.
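The aggregate() calls above cover the mean and median; the mode mentioned earlier needs a small helper, since base R has no built-in mode function. A minimal sketch (the helper name get_mode is ours, not from the original analysis; ties resolve to the first value encountered):

# helper to find the most frequent value in a vector
get_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
# mode of K per model, mirroring the aggregate() calls above
modek <- aggregate(choose_k_simple[, 3], list(choose_k_simple$model), get_mode)
modek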

Function to plot fastStructure choosek.py results

# function to plot our choosek.py data
plot_fastStructure <- function(df) {
  df |>
    ggplot() +
    geom_line(
      aes(
        x              = run,
        y              = k,
        color          = model
      ),
      linewidth = 1
    ) +
    scale_colour_manual(
      "model",
      values = c(
        structure      = "#9d60ff",
        likelihood     = "#ffc08c"
      ),
      labels = c(
        "Maximizes \n Likelihood \n", "Explain \n Structure"
      )
    ) +
    labs(
      x                = "Run",
      y                = "K",
      title            = "fastStructure simple for Europe (20,968 SNPs)",
      caption          = "algorithm runs for choices of K ranging from 1 to 25"
    ) +
    theme(
      panel.grid.major = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      panel.grid.minor = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      legend.text = element_text(
        size = 12
      ),
      legend.title = element_text(
        size           = 14,
        face           = "bold"
      ),
      legend.position = "right"
    )
}

Use our function to plot k values

plot_fastStructure(
  choose_k_simple
) +
  labs(
    subtitle = "simple fastStructure for Europe (20,968 SNPs)"
  )

ggsave(
  here(
    "output", "europe", "figures", "fastStructure", "fastStructure_simple_k1_k25_europe_MAF_1.pdf"
  ),
  width  = 8,
  height = 6,
  units  = "in"
)

1.1.8. Find mean K maximizing likelihood

choose_k_likelihood <- subset(choose_k_simple, model != "structure")
mean(choose_k_likelihood$k) #18.04
## [1] 18.04
median(choose_k_likelihood$k) #18
## [1] 18

2. Plots for fastStructure simple r2<0.01, MAF 1%, Europe

Using ggplot2 for individual admixtures

2.1 Extract ancestry coefficients for k=15

Change the path to the correct run and Q matrix.

fask15 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run072/simple.15.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask15)
## # A tibble: 6 × 15
##         X1    X2    X3    X4    X5    X6    X7    X8      X9   X10     X11   X12
##      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>   <dbl> <dbl>
## 1 0.531     2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6 3.75e-1  2e-6 2   e-6  2e-6
## 2 0.000002  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6 9.90e-1  2e-6 2   e-6  2e-6
## 3 0.000002  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6 2   e-6  2e-6 2   e-6  2e-6
## 4 0.399     2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6 6.01e-1  2e-6 2   e-6  2e-6
## 5 0.578     2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6 4.22e-1  2e-6 2   e-6  2e-6
## 6 0.232     2e-6  2e-6  2e-6  2e-6  2e-6  2e-6  2e-6 5.39e-1  2e-6 5.01e-2  2e-6
## # ℹ 3 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>

The fam file

fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create the ID columns

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to the matrix

fask15 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask15)

head(fask15)
##    ind pop       X1    X2    X3    X4    X5    X6    X7    X8       X9   X10
## 1 1065 SOC 0.531236 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.375485 2e-06
## 2 1066 SOC 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.990424 2e-06
## 3 1067 SOC 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06
## 4 1068 SOC 0.398970 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.601009 2e-06
## 5 1069 SOC 0.578240 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.421739 2e-06
## 6 1070 SOC 0.231965 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.538995 2e-06
##        X11   X12      X13   X14   X15
## 1 0.000002 2e-06 0.093260 2e-06 2e-06
## 2 0.000002 2e-06 0.009555 2e-06 2e-06
## 3 0.000002 2e-06 0.999977 2e-06 2e-06
## 4 0.000002 2e-06 0.000002 2e-06 2e-06
## 5 0.000002 2e-06 0.000002 2e-06 2e-06
## 6 0.050104 2e-06 0.178918 2e-06 2e-06

Rename the columns

# Rename the columns starting from the third one
fask15 <- fask15 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

fask15$pop[fask15$pop == 'SCH'] <- 'STS'

# View the first few rows
head(fask15)
##    ind pop       v1    v2    v3    v4    v5    v6    v7    v8       v9   v10
## 1 1065 SOC 0.531236 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.375485 2e-06
## 2 1066 SOC 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.990424 2e-06
## 3 1067 SOC 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06
## 4 1068 SOC 0.398970 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.601009 2e-06
## 5 1069 SOC 0.578240 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.421739 2e-06
## 6 1070 SOC 0.231965 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.538995 2e-06
##        v11   v12      v13   v14   v15
## 1 0.000002 2e-06 0.093260 2e-06 2e-06
## 2 0.000002 2e-06 0.009555 2e-06 2e-06
## 3 0.000002 2e-06 0.999977 2e-06 2e-06
## 4 0.000002 2e-06 0.000002 2e-06 2e-06
## 5 0.000002 2e-06 0.000002 2e-06 2e-06
## 6 0.050104 2e-06 0.178918 2e-06 2e-06

Import Sample Locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_15 <-
  c("blue",
    "#FF8C1A",
    "purple4",  
    "#77DD77", 
    "orangered",
    "magenta",
    "#75FAFF",    
    "#F49AC2",  
    "#B22222",  
    "#B20CC9",    
    "green",    
    "#1E90FF",    
    "yellow2",
    "green4",
    "purple")
source(
  here("my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask15 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
#  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:15)

# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_15[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n fastStructure inference with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_15) +
  expand_limits(y = c(0, 1.5))
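The normalize_data() helper used above drops any ancestry component at or below the threshold (here 0.1) and rescales what remains, so each individual's bar still sums to 1. A minimal sketch with made-up values (not real data) shows the effect:

# toy illustration (hypothetical values), using the normalize_data() defined above
toy <- tibble(
  ind   = c("A", "A", "A", "B", "B"),
  value = c(0.85, 0.10, 0.05, 0.60, 0.40)
)
normalize_data(toy, 0.1)
# individual A keeps only the 0.85 component, rescaled to 1;
# individual B keeps 0.60 and 0.40, which already sum to 1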

2.2 Extract ancestry coefficients for k=18

Change the path to the correct run and Q matrix.

fask18 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run093/simple.18.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask18)
## # A tibble: 6 × 18
##         X1    X2      X3    X4    X5    X6    X7    X8    X9   X10   X11     X12
##      <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 0.000001  1e-6 8.21e-1  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6
## 2 0.000001  1e-6 1   e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1.00e+0
## 3 0.000001  1e-6 1   e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 1   e-6
## 4 0.0182    1e-6 5.38e-1  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 4.44e-1
## 5 0.000001  1e-6 6.77e-1  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 2.93e-1
## 6 0.0559    1e-6 4.29e-1  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6  1e-6 3.78e-1
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## #   X18 <dbl>

The fam file

fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create the ID columns

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to the matrix

fask18 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask18)

head(fask18)
##    ind pop       X1    X2       X3    X4    X5    X6    X7    X8    X9   X10
## 1 1065 SOC 0.000001 1e-06 0.820516 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1067 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1068 SOC 0.018240 1e-06 0.537642 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1069 SOC 0.000001 1e-06 0.677058 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1070 SOC 0.055852 1e-06 0.429497 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
##     X11      X12      X13      X14      X15   X16   X17   X18
## 1 1e-06 0.000001 0.000001 0.000001 0.179463 1e-06 1e-06 1e-06
## 2 1e-06 0.999977 0.000001 0.000001 0.000001 1e-06 1e-06 1e-06
## 3 1e-06 0.000001 0.000001 0.000001 0.999977 1e-06 1e-06 1e-06
## 4 1e-06 0.443530 0.000001 0.000569 0.000001 1e-06 1e-06 1e-06
## 5 1e-06 0.293325 0.029597 0.000001 0.000001 1e-06 1e-06 1e-06
## 6 1e-06 0.377923 0.004133 0.000001 0.132579 1e-06 1e-06 1e-06

Rename the columns

# Rename the columns starting from the third one
fask18 <- fask18 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

fask18$pop[fask18$pop == 'SCH'] <- 'STS'

# View the first few rows
head(fask18)
##    ind pop       v1    v2       v3    v4    v5    v6    v7    v8    v9   v10
## 1 1065 SOC 0.000001 1e-06 0.820516 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1067 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1068 SOC 0.018240 1e-06 0.537642 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1069 SOC 0.000001 1e-06 0.677058 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1070 SOC 0.055852 1e-06 0.429497 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
##     v11      v12      v13      v14      v15   v16   v17   v18
## 1 1e-06 0.000001 0.000001 0.000001 0.179463 1e-06 1e-06 1e-06
## 2 1e-06 0.999977 0.000001 0.000001 0.000001 1e-06 1e-06 1e-06
## 3 1e-06 0.000001 0.000001 0.000001 0.999977 1e-06 1e-06 1e-06
## 4 1e-06 0.443530 0.000001 0.000569 0.000001 1e-06 1e-06 1e-06
## 5 1e-06 0.293325 0.029597 0.000001 0.000001 1e-06 1e-06 1e-06
## 6 1e-06 0.377923 0.004133 0.000001 0.132579 1e-06 1e-06 1e-06

Import Sample Locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_18 <-
  c(    
    "#77DD77",    
    "#AE8333",    
    "#B22222",
    "purple",
    "green4",
    "#75FAFF",
    "orangered",
    "#B20CC9", 
    "#1E90FF",      
    "blue",
    "#FFFF99",
    "purple4",
    "navy",     
    "#FF8C1A",
    "magenta", 
    "#F49AC2",
    "green",
    "yellow2"
    )
source(
  here("my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask18 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
#  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:18)

# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_18[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n fastStructure inference with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_18) +
  expand_limits(y = c(0, 1.5))

2.3 Extract ancestry coefficients for k=5

Change the path to the correct run and Q matrix.

fask5 <- read_delim(
  here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run072/simple.5.meanQ"),
  delim = "  ", # Specify the delimiter if different from the default (comma)
  col_names = FALSE,
  show_col_types = FALSE
) 
head(fask5)
## # A tibble: 6 × 5
##         X1       X2       X3    X4       X5
##      <dbl>    <dbl>    <dbl> <dbl>    <dbl>
## 1 0.0533   0.000005 0.000005 0.945 0.00211 
## 2 0.000005 0.000005 0.000005 1.00  0.000005
## 3 0.000005 0.000005 0.000005 1.00  0.000005
## 4 0.000005 0.000005 0.000005 0.943 0.0572  
## 5 0.0400   0.000005 0.0794   0.881 0.000005
## 6 0.0991   0.000005 0.000005 0.901 0.000005

The fam file

fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")

# Read the .fam file
fam_data <- read.table(fam_file, 
                       header = FALSE,
                       col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))

# View the first few rows
head(fam_data)
##   FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1      SOC         1065          0          0   0        -9
## 2      SOC         1066          0          0   0        -9
## 3      SOC         1067          0          0   0        -9
## 4      SOC         1068          0          0   0        -9
## 5      SOC         1069          0          0   0        -9
## 6      SOC         1070          0          0   0        -9

Create the ID columns

# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"

# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"

# Select ID
fam_data <- fam_data |>
  dplyr::select("ind", "pop")

# View the first few rows
head(fam_data)
##    ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC

Add it to the matrix

fask5 <- fam_data |>
  dplyr::select(ind, pop) |>
  bind_cols(fask5)

head(fask5)
##    ind pop       X1    X2       X3       X4       X5
## 1 1065 SOC 0.053349 5e-06 0.000005 0.944530 0.002111
## 2 1066 SOC 0.000005 5e-06 0.000005 0.999980 0.000005
## 3 1067 SOC 0.000005 5e-06 0.000005 0.999980 0.000005
## 4 1068 SOC 0.000005 5e-06 0.000005 0.942767 0.057218
## 5 1069 SOC 0.040044 5e-06 0.079437 0.880509 0.000005
## 6 1070 SOC 0.099121 5e-06 0.000005 0.900864 0.000005

Rename the columns

# Rename the columns starting from the third one
fask5 <- fask5 |>
  rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))

fask5$pop[fask5$pop == 'SCH'] <- 'STS'

# View the first few rows
head(fask5)
##    ind pop       v1    v2       v3       v4       v5
## 1 1065 SOC 0.053349 5e-06 0.000005 0.944530 0.002111
## 2 1066 SOC 0.000005 5e-06 0.000005 0.999980 0.000005
## 3 1067 SOC 0.000005 5e-06 0.000005 0.999980 0.000005
## 4 1068 SOC 0.000005 5e-06 0.000005 0.942767 0.057218
## 5 1069 SOC 0.040044 5e-06 0.079437 0.880509 0.000005
## 6 1070 SOC 0.099121 5e-06 0.000005 0.900864 0.000005

Import Sample Locations

sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
head(sampling_loc)
##       Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1   Berlin, NJ     USA 39.79081  -74.9291  Americas          BER 2018
## 2 Columbus, OH     USA 39.97170  -82.9071  Americas          COL 2015
## 3   Palm Beach     USA 26.70560  -80.0364  Americas          PAL 2018
## 4  Houston, TX     USA 29.75491  -95.3505  Americas          HOU 2018
## 5  Los Angeles     USA 34.05220 -118.2437  Americas          LOS 2018
## 6   Manaus, AM  Brazil -3.09161  -60.0325  Americas          MAU 2017
##          Region Subregion order order2 orderold
## 1 North America               1     NA       75
## 2 North America               2     NA       76
## 3 North America               3     NA       77
## 4 North America               4     NA       78
## 5 North America               5     NA       79
## 6 South America               6     NA       80
color_palette_5 <-
  c(
    "#FFFF19",
    "#FF8C1A",
    "#77DD37",   
    "#1E90FF",
    "purple3"
        )
source(
  here("my_theme3.R"
  )
)

# Melt the data frame for plotting
Q_melted <- fask5 |>
  pivot_longer(
    cols = -c(ind, pop),
    names_to = "variable",
    values_to = "value"
  )
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
  left_join(sampling_loc, by = c("pop" = "Abbreviation"))

# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
#  mutate(Region_Country = interaction(Region, Country, sep = "_"))

# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
  arrange(order, ind) |>
  mutate(ind = factor(ind, levels = unique(ind)))  # Convert ind to a factor with levels in the desired order

# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
  group_by(Country) |>
  mutate(label = ifelse(row_number() == 1, as.character(Country), NA))

# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
  group_by(ind, variable) |>
  summarise(value = mean(value), .groups = "drop")

# Create a data frame for borders
borders <-
  data.frame(Country = unique(Q_ordered$Country))

# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
  sapply(borders$Country, function(rc)
    max(which(Q_ordered$Country == rc))) + 0.5  # Shift borders to the right edge of the bars

# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
  filter(!is.na(label)) |>
  distinct(label, .keep_all = TRUE)

# Create a custom label function
label_func <- function(x) {
  labels <- rep("", length(x))
  labels[x %in% label_df$ind] <- label_df$label
  labels
}

# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) + 0)

# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
  mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
  group_by(pop) |>
  slice_head(n = 1) |>
  ungroup() |>
  dplyr::select(ind, Pop_City, Country, Name) |>
  mutate(pos = as.numeric(ind))  # calculate position of population labels

pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)


# Calculate the position of lines
border_positions <- Q_ordered |>
  group_by(Country) |>
  summarise(pos = max(as.numeric(ind)) - 1)


pop_labels_bars <- pop_labels |>
  mutate(pos = as.numeric(ind)  - .5)

# Function to filter and normalize data
normalize_data <- function(df, min_value) {
  df |>
    filter(value > min_value) |>
    group_by(ind) |>
    mutate(value = value / sum(value))
}

# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# 

# Generate all potential variable names
all_variables <- paste0("v", 1:5)

# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
                            color = color_palette_5[1:length(all_variables)])

# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")

# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
  geom_bar(stat = 'identity', width = 1) +
  geom_vline(
    data = pop_labels_bars,
    aes(xintercept = pos),
    color = "#2C444A",
    linewidth = .2
  ) +
  geom_text(
    data = pop_labels,
    aes(x = as.numeric(ind), y = 1, label = Name),
    vjust = 1.5,
    hjust = 0,
    size = 2,
    angle = 90,
    inherit.aes = FALSE
  ) +
  my_theme() +
  theme(
    axis.text.x = element_text(
      angle = 90,
      hjust = 1,
      size = 12
    ),
    legend.position = "none",
    plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
  ) +
  xlab("Admixture matrix") +
  ylab("Ancestry proportions") +
  labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n fastStructure inference with 20,968 SNPs.") +
  scale_x_discrete(labels = label_func) +
  scale_fill_manual(values = color_palette_5) +
  expand_limits(y = c(0, 1.5))
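The k=15, k=18, and k=5 bar plots are not written to disk in this document; if you want a copy, a ggsave() call like the one used for the choose-K plot works (the file name and dimensions below are only suggestions):

ggsave(
  here(
    "output", "europe", "figures", "fastStructure", "fastStructure_simple_barplot_k5_europe_MAF_1.pdf"
  ),
  width  = 12,
  height = 6,
  units  = "in"
)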

3. Run admixture with SNP Set 3 (MAF 1%) for European populations

3.1 Load the libraries

# library(devtools)         # if you need to install some package from github or something else
library(tidyverse)          # many helpful things
library(here)               # to load data easily
library(colorout)           # colors are cool
library(dplyr)              # to manipulate data
library(flextable)          # create tables
#library(ggstatsplot)        # statistics and plotting
# library(ggplot2)          # plots
# library(RColorBrewer)     # color palettes
# library(grid)             # set up plots
# library(scales)           # helps with plots axis scales
# library(showtext)         # helps annotating plots
# library(reticulate)       # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont)          # probably won't work on openOnDemand on the clusters
# library(hrbrthemes)       # not really needed, personal preference for plots
# library(hrbrmisc)         # not really needed, personal preference for plots
library(stringr)            # for strings operations
library(flextable)          # create tables
library(officer)            # export office format

3.2 Export path

# Add this in your script
export PATH=$PATH:/home/mkc54/admixture/releases/admixture_linux-1.3.0/

# Then call admixture from anywhere
admixture -h

3.3 Run admixture

3.3.1. Create directories for MAF_1

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture

mkdir MAF_1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1

for i in $(seq 1 5)
do
  mkdir run$i
done

3.3.2 Generate the script for Admixture. We will use 2000 bootstraps and cv=10.


admixture europe.bed $name -j20 --cv=10 -B2000 # we will run with 2000 bootstraps, cross-validation 10, using 20 threads

Now generate the full script for dsq for MAF 1%

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1
for run in $(seq 1 5); do
    for k in $(seq 1 25); do
        echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run$run; \
module load ADMIXTURE/1.3.0; \
admixture -s $run --cv=10 -B2000 -j20 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.bed $k | tee log_k$k.txt"
    done
done > dsq_MAF_1.txt  # 125 jobs (5 runs x 25 Ks)

Generate the dsq batch files on the cluster for MAF 1%

module load dSQ/1.05
# make script
dsq \
--job-file dsq_MAF_1.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_MAF_1 \
--batch-file admixture_MAF_1.sh 
sed -i '6 i #SBATCH --requeue' admixture_MAF_1.sh;

#check it
head admixture_MAF_1.sh

3.3.3 Run Admixture for MAF 1%

Before we submit all jobs, it is a good practice to submit only one job of the array and see if it runs without problems.

# create a directory for the logs
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1
mkdir logs
#
# submit the first job to see if it works.
sbatch --array=0 admixture_MAF_1.sh
# Submitted batch job 18205974
# check status
dsqa -j 18205974  # if no errors, we can go ahead and submit all jobs

If it runs without problems, we can submit all jobs.

sbatch admixture_MAF_1.sh
# Submitted batch job 18211450 
# Retry 18211450

# check status
dsqa -j 18211450

3.3.4 Run autopsy once it is done, for MAF 1%

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1

dsqa -j 18211450 #all completed

Check if the seed is different for each run

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1

# first get how many jobs we have
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | wc -l
# 125
# then count unique seeds
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | uniq | wc -l
# 5
# if the two counts do not match, we have duplicated seeds. Repeated seeds across different Ks within the same run are fine; what we cannot have is the same seed used in different runs for the same K.
grep -H 'seed' run*/*.txt 
## 125
## 5
## run1/log_k1.txt:Random seed: 1
## run1/log_k10.txt:Random seed: 1
## run1/log_k11.txt:Random seed: 1
## run1/log_k12.txt:Random seed: 1
## run1/log_k13.txt:Random seed: 1
## run1/log_k14.txt:Random seed: 1
## run1/log_k15.txt:Random seed: 1
## run1/log_k16.txt:Random seed: 1
## run1/log_k17.txt:Random seed: 1
## run1/log_k18.txt:Random seed: 1
## run1/log_k19.txt:Random seed: 1
## run1/log_k2.txt:Random seed: 1
## run1/log_k20.txt:Random seed: 1
## run1/log_k21.txt:Random seed: 1
## run1/log_k22.txt:Random seed: 1
## run1/log_k23.txt:Random seed: 1
## run1/log_k24.txt:Random seed: 1
## run1/log_k25.txt:Random seed: 1
## run1/log_k3.txt:Random seed: 1
## run1/log_k4.txt:Random seed: 1
## run1/log_k5.txt:Random seed: 1
## run1/log_k6.txt:Random seed: 1
## run1/log_k7.txt:Random seed: 1
## run1/log_k8.txt:Random seed: 1
## run1/log_k9.txt:Random seed: 1
## run2/log_k1.txt:Random seed: 2
## run2/log_k10.txt:Random seed: 2
## run2/log_k11.txt:Random seed: 2
## run2/log_k12.txt:Random seed: 2
## run2/log_k13.txt:Random seed: 2
## run2/log_k14.txt:Random seed: 2
## run2/log_k15.txt:Random seed: 2
## run2/log_k16.txt:Random seed: 2
## run2/log_k17.txt:Random seed: 2
## run2/log_k18.txt:Random seed: 2
## run2/log_k19.txt:Random seed: 2
## run2/log_k2.txt:Random seed: 2
## run2/log_k20.txt:Random seed: 2
## run2/log_k21.txt:Random seed: 2
## run2/log_k22.txt:Random seed: 2
## run2/log_k23.txt:Random seed: 2
## run2/log_k24.txt:Random seed: 2
## run2/log_k25.txt:Random seed: 2
## run2/log_k3.txt:Random seed: 2
## run2/log_k4.txt:Random seed: 2
## run2/log_k5.txt:Random seed: 2
## run2/log_k6.txt:Random seed: 2
## run2/log_k7.txt:Random seed: 2
## run2/log_k8.txt:Random seed: 2
## run2/log_k9.txt:Random seed: 2
## run3/log_k1.txt:Random seed: 3
## run3/log_k10.txt:Random seed: 3
## run3/log_k11.txt:Random seed: 3
## run3/log_k12.txt:Random seed: 3
## run3/log_k13.txt:Random seed: 3
## run3/log_k14.txt:Random seed: 3
## run3/log_k15.txt:Random seed: 3
## run3/log_k16.txt:Random seed: 3
## run3/log_k17.txt:Random seed: 3
## run3/log_k18.txt:Random seed: 3
## run3/log_k19.txt:Random seed: 3
## run3/log_k2.txt:Random seed: 3
## run3/log_k20.txt:Random seed: 3
## run3/log_k21.txt:Random seed: 3
## run3/log_k22.txt:Random seed: 3
## run3/log_k23.txt:Random seed: 3
## run3/log_k24.txt:Random seed: 3
## run3/log_k25.txt:Random seed: 3
## run3/log_k3.txt:Random seed: 3
## run3/log_k4.txt:Random seed: 3
## run3/log_k5.txt:Random seed: 3
## run3/log_k6.txt:Random seed: 3
## run3/log_k7.txt:Random seed: 3
## run3/log_k8.txt:Random seed: 3
## run3/log_k9.txt:Random seed: 3
## run4/log_k1.txt:Random seed: 4
## run4/log_k10.txt:Random seed: 4
## run4/log_k11.txt:Random seed: 4
## run4/log_k12.txt:Random seed: 4
## run4/log_k13.txt:Random seed: 4
## run4/log_k14.txt:Random seed: 4
## run4/log_k15.txt:Random seed: 4
## run4/log_k16.txt:Random seed: 4
## run4/log_k17.txt:Random seed: 4
## run4/log_k18.txt:Random seed: 4
## run4/log_k19.txt:Random seed: 4
## run4/log_k2.txt:Random seed: 4
## run4/log_k20.txt:Random seed: 4
## run4/log_k21.txt:Random seed: 4
## run4/log_k22.txt:Random seed: 4
## run4/log_k23.txt:Random seed: 4
## run4/log_k24.txt:Random seed: 4
## run4/log_k25.txt:Random seed: 4
## run4/log_k3.txt:Random seed: 4
## run4/log_k4.txt:Random seed: 4
## run4/log_k5.txt:Random seed: 4
## run4/log_k6.txt:Random seed: 4
## run4/log_k7.txt:Random seed: 4
## run4/log_k8.txt:Random seed: 4
## run4/log_k9.txt:Random seed: 4
## run5/log_k1.txt:Random seed: 5
## run5/log_k10.txt:Random seed: 5
## run5/log_k11.txt:Random seed: 5
## run5/log_k12.txt:Random seed: 5
## run5/log_k13.txt:Random seed: 5
## run5/log_k14.txt:Random seed: 5
## run5/log_k15.txt:Random seed: 5
## run5/log_k16.txt:Random seed: 5
## run5/log_k17.txt:Random seed: 5
## run5/log_k18.txt:Random seed: 5
## run5/log_k19.txt:Random seed: 5
## run5/log_k2.txt:Random seed: 5
## run5/log_k20.txt:Random seed: 5
## run5/log_k21.txt:Random seed: 5
## run5/log_k22.txt:Random seed: 5
## run5/log_k23.txt:Random seed: 5
## run5/log_k24.txt:Random seed: 5
## run5/log_k25.txt:Random seed: 5
## run5/log_k3.txt:Random seed: 5
## run5/log_k4.txt:Random seed: 5
## run5/log_k5.txt:Random seed: 5
## run5/log_k6.txt:Random seed: 5
## run5/log_k7.txt:Random seed: 5
## run5/log_k8.txt:Random seed: 5
## run5/log_k9.txt:Random seed: 5

3.3.5 Get the cross-validation values for r2<0.01, MAF 1%

Collect the cross-validation information from all the log files. We have to do it for each run: we need the CV error and the K from each log file.

# navigate to the data directory
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1
# let's check the log files using grep. We need to get the line with the CV values. We can grep CV
grep CV run1/log*.txt | head
#run1/log_k10.txt:CV error (K=10): 0.70301
#run1/log_k11.txt:CV error (K=11): 0.70093
#run1/log_k12.txt:CV error (K=12): 0.69920
## run1/log_k1.txt:CV error (K=1): 0.76386
## run1/log_k10.txt:CV error (K=10): 0.70301
## run1/log_k11.txt:CV error (K=11): 0.70093
## run1/log_k12.txt:CV error (K=12): 0.69920
## run1/log_k13.txt:CV error (K=13): 0.69982
## run1/log_k14.txt:CV error (K=14): 0.69909
## run1/log_k15.txt:CV error (K=15): 0.69856
## run1/log_k16.txt:CV error (K=16): 0.70058
## run1/log_k17.txt:CV error (K=17): 0.70064
## run1/log_k18.txt:CV error (K=18): 0.70122

3.3.6 Parse the logs for MAF 1%

Make loop to get summary from all 5 runs for MAF 1%

# on the cluster, we can build the loop step by step. For example, we can check that the loop picks up the files we need: instead of the grep command, use echo $i to list the files. We need to use "| head", otherwise it will list all 125 files.
for i in $(ls -1 run*/log*.txt); do
    echo $i 
done | head -n 2
#run1/log_k10.txt
#run1/log_k11.txt

# now we can replace echo with our loop from the previous chunk, and save it as a file.
# remember, we now have to use grep and not echo
# first let's test the command. Sometimes the macOS and Linux versions of these tools differ. We have to add the -H flag so grep prints the file path; we need it to get the run number.
for i in $(ls -1 run*/log*.txt); do
    grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done | head -n 2
#1 10 0.70301
#1 11 0.70093

# it works, now we can run the loop and write the results in a file.
for i in $(ls -1 run*/log*.txt); do
    grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done > cross_validation_MAF_1.txt
# we should have 125 lines in our file
wc -l cross_validation_MAF_1.txt
# 125

head cross_validation_MAF_1.txt
# it looks okay

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1
cp -r cross_validation_MAF_1.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/analyses/europe

Import the cross validation values into R

cross_val <-
  read_delim(
    here(
      "analyses", "europe", "cross_validation_MAF_1.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "iin"
  ) |>
  rename(
    run = 1,
    k   = 2,
    cv  = 3
  ) |>
  mutate(
    run = as.factor(
      run
    )
  )
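A quick sanity check on the imported table; with 5 runs and K from 1 to 25 we expect 125 rows (output omitted):

nrow(cross_val) # should be 125
head(cross_val)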

Make a plot of the cross-validation errors for the 5 runs (2000 bootstraps, 10-fold cross-validation)

##  ............................................................................
##  2000 bootstraps and 10-fold cv                                        ####
#
# make plot
cross_val |>
  ggplot() +
  geom_line(
    aes(
      x        = k,
      y        = cv,
      group    = run,
      color    = run,
      linetype = run
    ),
    linewidth  = .75
  ) +
  labs(
    x        = "K",
    y        = "Cross-validation error",
    title    = "Admixture Cross-validation for Europe (20,968 SNPs)",
    subtitle = "2000 bootstraps and cv = 10 ",
    caption  = "algorithm runs for choices of K ranging from 1 to 25"
  ) +
  scale_color_manual(
    values = c(
      "#C1CDCD", "#1AC722", "#FF8C00", "#87CEFA", "#FF3030"
    )
  ) +
  theme(
    panel.grid.major = element_line(
      linetype       = "dashed",
      linewidth      = 0.2,
    ),
    legend.title     = element_text(
        size           = 14,
        face           = "bold"
      ),
    panel.grid.minor = element_line(
      linetype       = "dashed",
      linewidth      = 0.2
    )
  )

Save the plot

ggsave(
  here(
    "analyses", "europe", "admixture_MAF_1_2000Bootstraps_CV_k1_k25.pdf"
  ),
  width  = 8,
  height = 6,
  units  = "in"
)

Now we can find the K with the lowest cross-validation error

##  2000 bootstraps and 10-fold cv                                         #
# to find the lowest cv error for each run
cross_val |>
  group_by(
    run
  ) |>
  summarize(
    LowestCVerror = min(
      cv,
      na.rm       = T
    )
  ) |>
  arrange(
    run
  ) -> cv_min
#
# we can also get the k with the lowest cv error
cross_val |>
  group_by(
    run
  ) |>
  slice(
    which.min(
      cv
    )
  ) |>
  arrange(
    run
  ) -> k_with_lowest_cv
#
# we can left_joint the objects and make a table
admix_ks <-
  left_join(
    cv_min,
    k_with_lowest_cv,
    by = "run"
  ) |>
    select(
      -cv
  )
#

# make table
ftbadmix <-
  flextable(
    admix_ks |>
      rename(
        Run               = 1,
        "Lowest CV error" = 2,
        K                 = 3,
      )
  )|>
  set_caption(
    as_paragraph(
      as_chunk(
        "Ks with lowest cross-validation errors for 5 runs of Admixture (20,968 SNPs)",
        props             = fp_text_default(
          font.fam        = "Cambria"
        )
      )
    ),
    word_stylename        = "Table Caption"
  )
#
# check the table
autofit(ftbadmix)
Ks with lowest cross-validation errors for 5 runs of Admixture (20,968 SNPs)

Run  Lowest CV error  K
1    0.69856          15
2    0.69812          16
3    0.69796          13
4    0.69820          15
5    0.69787          17

#
# save the table
# create settings - check ??save_as_docx on console
sect_properties <-
  prop_section(
  page_size    = page_size(
    orient     = "portrait",
    width      = 8.3, 
    height     = 11.7
  ),
  type         = "continuous",
  page_margins = page_mar()
)
# save
# then you can open MS Word and make adjustments
ftbadmix |>
  autofit() |>
  save_as_docx(
    path       = here(
      "analyses", "europe", "admixture_europe_CV_k1_k25_MAF_1.docx"
    ),
    pr_section = sect_properties
  )

SNP Sets 1 and 2

1. Run fastStructure for European populations

1.1. SNP Set 2: Run the Simple prior in fastStructure for the file with ~47k SNPs (LD pruning with the r2=0.1 dataset)

1.1.1. Check populations in the file with ~47k SNPs (r2=0.1 dataset)

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam | sort | uniq -c | awk '{print $2}' 
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## IMP
## ITB
## ITP
## ITR
## KER
## KRA
## MAL
## POL
## POP
## RAR
## ROM
## ROS
## SER
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPM
## SPS
## STS
## TIK
## TIR
## TRE
## TUA
## TUH

Count the total number of populations

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam | sort | uniq -c | wc -l
## 41

1.1.2 Create a directory for each run.

#create directories for simple europe r_1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
for i in $(seq -w 1 100)
do
  mkdir run$i
done
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
ls #should have 100 folders now

1.1.3. Create the dsq files

#create dsq file for simple europe r_1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
module load dSQ/1.05 

for k in $(seq 1 40); do
    for run in $(seq -w 1 100); do
        echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1; source /vast/palmer/apps/software/miniconda/23.3.1/etc/profile.d/conda.sh; module load miniconda/4.12.0; conda activate fastStructure; export PYTHONPATH=/home/mkc54/fastStructure/; python -m structure -K $k --input=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1 --output=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run$run/simple --full --cv=10 --tol=10e-6"
    done
done > dsq_europe_r_1.txt

head dsq_europe_r_1.txt #check file

dsq --job-file dsq_europe_r_1.txt --output /dev/null --mail-type ALL --job-name fS.simple_europe_r_1 --batch-file simple_europe_r_1.sh

#Check the files to see if they have 4,000 jobs:
head simple_europe_r_1.sh

1.1.4 Run the jobs

Run a random job from our array to see if it works

#test job for simple europe r_1
sbatch --array=$((1 + $RANDOM % 3000)) simple_europe_r_1.sh

# check status
dsqa -j 9908113 #insert job number

#if it looks like its running ok, you can cancel it 
scancel 9908113 #insert job #

Submit all jobs.

sbatch simple_europe_r_1.sh
#Submitted batch job # 

#Check it
dsqa -j 9928902 #add job number here

1.1.5. Run the autopsy once it's done

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
module load dSQ/1.05

dsqa -j 9928902 -f dsq_europe_r_1.txt -s TIMEOUT > re-run_jobs1.txt 2> 9928902_report1.txt; head 9928902_report1.txt
head re-run_jobs1.txt # 227 timed out

#create new batch script for any timeout jobs
dsq --job-file re-run_jobs1.txt --mem-per-cpu 5g --cpus-per-task=1 --mail-type END -t 12:00:00 --job-name fs.simple_europe_rerun1 --output /dev/null --batch-file simple_europe_rerun1.sh
head simple_europe_rerun1.sh

#Re-run timeout jobs
sbatch simple_europe_rerun1.sh #10235691
dsqa -j 10235691

#Run autopsy of reruns 
dsqa -j 10235691 -f re-run_jobs1.txt -s TIMEOUT > re-run_jobs3.txt 2> 10235691_report.txt
#all completed successfully

1.1.6. Find the optimal number of K.

The optimal K varies from run to run. You can estimate the mean, mode, and median across runs, then choose a value that makes sense.

#choose k for simple fastStructure run for simple europe r2=0.1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
srun -p ycga --pty -N 1 -n 1 -c 4 bash #start interactive session

module load miniconda
conda activate fastStructure 

#First check files
ls run*/* | wc -l # should be 20000

# choose k for manuscript data
export PYTHONPATH=/home/mkc54/fastStructure
python -m chooseK #see how we can use chooseK
#python /vast/palmer/home.mccleary/mkc54/fastStructure/chooseK.py
   #      --input=<file>

for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run*
do
    echo $i | sed 's@.*/@@'; python -m chooseK --input=$i/simple | grep 'likelihood\|structure'
done

Get outputs in a file

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1

cat <(echo 'run likelihood structure') \
<(for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run*
do
    echo $i | sed 's@.*/@@' | sed 's/run/run /' ; python -m chooseK --input=$i/simple | grep 'likelihood' | awk '{print $6, $8}'; python -m chooseK --input=$i/simple | grep 'structure' |  awk '{print $6, $10}'
done | xargs -n6 | awk '{print $2, $4, $6}') > simple_europe_r0.1_40k.txt

head simple_europe_r0.1_40k.txt #check the file
#001 12 27
#002 20 27
#003 19 27

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
cp -r simple_europe_r0.1_40k.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe #copy file 

1.1.7. Make a plot with the Ks.

Let’s import the data in R.

# library(devtools)         # if you need to install some package from github or something else
library(tidyverse)          # many helpful things
library(here)               # to load data easily
#library(colorout)           # colors are cool
library(dplyr)              # to manipulate data
library(ggplot2)            # plots
# library(RColorBrewer)     # color pallets
# library(grid)             # set up plots
# library(scales)           # helps with plots axis scales
# library(showtext)         # helps annotating plots
# library(reticulate)       # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont)          # probably won't work on openOnDemand on the clusters
library(hrbrthemes)         # not really needed, personal preference for plots
#library(hrbrmisc)           # not really needed, personal preference for plots
# library(stringr)          # for strings operations
#library(ggstatsplot)        # statistics and plotting
library(flextable)          # create tables
library(officer)            # export office format

Define the import_fastStructure() function to read the chooseK.py output.

# function to import our choosek.py data
import_fastStructure <- function(file) {
  # import as a tibble and set columns as integers
  dat <- read_delim(
    file,
    col_names      = TRUE,
    show_col_types = FALSE,
    col_types      = "iii"
  )

  # get columns we need and make it long for plotting
  dat <- dat |>
    gather(
      structure, likelihood, -run
    )

  # rename the columns by index
  dat <- dat |>
    rename(
      run   = 1,
      model = 2,
      k     = 3
    )
  return(dat)
}
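
The function above uses gather(), which is superseded in newer tidyr releases. A minimal sketch of an equivalent (here called import_fastStructure2, just for illustration) using pivot_longer(), assuming the same three-column input (run, likelihood, structure), would be:

# sketch: same import done with pivot_longer() instead of gather()
import_fastStructure2 <- function(file) {
  read_delim(
    file,
    col_names      = TRUE,
    show_col_types = FALSE,
    col_types      = "iii"
  ) |>
    pivot_longer(
      cols      = -run,
      names_to  = "model",   # "likelihood" or "structure"
      values_to = "k"        # the K reported by chooseK.py
    )
}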

Run the import function on the simple-prior results for europe r2<0.1

choose_k_simple <- 
  import_fastStructure(
  here("output", "europe", "simple_europe_r0.1_40k.txt"
  )
)

Function to plot fastStructure choosek.py results for simple europe r2<0.1

# function to plot our choosek.py data
plot_fastStructure <- function(df) {
  df |>
    ggplot() +
    geom_line(
      aes(
        x              = run,
        y              = k,
        color          = model
      ),
      linewidth = 1
    ) +
    scale_colour_manual(
      "model",
      values = c(
        structure      = "#9d60ff",
        likelihood     = "#ffc08c"
      ),
      labels = c(
        "Maximizes \n Likelihood \n", "Explain \n Structure"
      )
    ) +
    labs(
      x                = "Run",
      y                = "K",
      title            = "fastStructure simple for Europe, r2<0.1 (47,484 SNPs)",
      caption          = "algorithm runs for choices of K ranging from 1 to 40"
    ) +
    theme(
      panel.grid.major = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      panel.grid.minor = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      legend.text = element_text(
        size = 12
      ),
      legend.title = element_text(
        size           = 14,
        face           = "bold"
      ),
      legend.position = "right"
    )
}

Use our function to plot k values

plot_fastStructure(
  choose_k_simple
) +
  labs(
    subtitle = "simple fastStructure for Europe, r2<0.1 (47,484 SNPs)"
  )

Save the plot

ggsave(
  here(
    "output", "europe", "figures", "fastStructure", "fastStructure_simple_k1_k40_europe_r0.1.pdf"
  ),
  width  = 8,
  height = 6,
  units  = "in"
)

1.1.8 Find mean K maximizing likelihood for simple europe r2<0.1

choose_k_likelihood <- subset(choose_k_simple, model != "structure")
mean(choose_k_likelihood$k) #14.92
## [1] 14.92
median(choose_k_likelihood$k) #15
## [1] 15
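
Section 1.1.6 also mentions the mode. Base R has no built-in function for the statistical mode, so here is a small helper (a sketch, not part of the original pipeline), applied to the choose_k_likelihood object from the chunk above:

# helper to get the most frequent K (statistical mode)
get_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
get_mode(choose_k_likelihood$k)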

1.2 SNP Set 1: Run Simple prior in fastStructure for file with ~17k SNPs (LD pruning with r2=0.01 dataset)

1.2.1 Check populations in file with ~17k SNPs (r2=0.01) dataset

#get_summary_samples for r_0.01
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam | sort | uniq -c | awk '{print $2}' 
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## IMP
## ITB
## ITP
## ITR
## KER
## KRA
## MAL
## POL
## POP
## RAR
## ROM
## ROS
## SER
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPM
## SPS
## STS
## TIK
## TIR
## TRE
## TUA
## TUH

Count the total number of sampling localities for europe r_01

awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam | sort | uniq -c | wc -l
## 41

1.2.2 Create a directory for each run for europe r_01

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01
for i in $(seq -w 1 100)
do
  mkdir run$i
done
ls #should have 100 folders now

1.2.3 Create the dsq files for europe r_01

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01
module load dSQ/1.05 

for k in $(seq 1 40); do
    for run in $(seq -w 1 100); do
        echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01; source /vast/palmer/apps/software/miniconda/23.3.1/etc/profile.d/conda.sh; module load miniconda/4.12.0; conda activate fastStructure; export PYTHONPATH=/home/mkc54/fastStructure/; python -m structure -K $k --input=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01 --output=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run$run/simple --full --cv=10 --tol=10e-6"
    done
done > dsq_europe_r_01.txt

head dsq_europe_r_01.txt #check file

dsq --job-file dsq_europe_r_01.txt --output /dev/null --mail-type ALL --partition=ycga -t 24:00:00 --job-name fS.simple_europe_r_01 --batch-file simple_europe_r_01.sh

#Check the files to see if they have 4,000 jobs:
head simple_europe_r_01.sh

1.2.4 Run the jobs for europe r_01

Run a random job from our array to test if it works

sbatch --array=$((1 + $RANDOM % 1000)) simple_europe_r_01.sh

# check status
dsqa -j 9918767 #insert job number
#it worked

Submit all jobs

sbatch simple_europe_r_01.sh
#Submitted batch job # 

#Check it
dsqa -j 9920352 #add job number here

1.2.5 Run autopsy for europe r_01

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01
module load dSQ/1.05

dsqa -j 9920352 -f dsq_europe_r_01.txt -s TIMEOUT > re-run_jobs1.txt 2> 9920352_report1.txt; head 9920352_report1.txt
head re-run_jobs1.txt #all completed

1.2.6. Find the optimal number of K.

As before, the optimal K varies from run to run. You can estimate the mean, mode, and median across runs, and then choose a value that makes sense.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01
srun -p ycga --pty -N 1 -n 1 -c 4 bash #start interactive session

module load miniconda
conda activate fastStructure 

#First check files
ls run*/* | wc -l # should be 20000

# choose K for the manuscript data
export PYTHONPATH=/home/mkc54/fastStructure
python -m chooseK #see how we can use chooseK
# equivalent to: python /vast/palmer/home.mccleary/mkc54/fastStructure/chooseK.py --input=<file>

for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run*
do
    echo $i | sed 's@.*/@@'; python -m chooseK --input=$i/simple | grep 'likelihood\|structure'
done

Get outputs in a file

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01

cat <(echo 'run likelihood structure') \
<(for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run*
do
    echo $i | sed 's@.*/@@' | sed 's/run/run /' ; python -m chooseK --input=$i/simple | grep 'likelihood' | awk '{print $6, $8}'; python -m chooseK --input=$i/simple | grep 'structure' |  awk '{print $6, $10}'
done | xargs -n6 | awk '{print $2, $4, $6}') > simple_europe_r0.01_40k.txt

head simple_europe_r0.01_40k.txt #check the file
#001 12 26
#002 38 27
#003 19 27

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01
cp -r simple_europe_r0.01_40k.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe #copy file 

1.2.7 Make a plot with the Ks for europe r_01

Run import function for simple prior models for europe r2<0.01

choose_k_simple <- 
  import_fastStructure(
  here("output", "europe", "simple_europe_r0.01_40k.txt"
  )
)

Function to plot fastStructure choosek.py results

# function to plot our choosek.py data
plot_fastStructure <- function(df) {
  df |>
    ggplot() +
    geom_line(
      aes(
        x              = run,
        y              = k,
        color          = model
      ),
      linewidth = 1
    ) +
    scale_colour_manual(
      "model",
      values = c(
        structure      = "#9d60ff",
        likelihood     = "#ffc08c"
      ),
      labels = c(
        "Maximizes \n Likelihood \n", "Explain \n Structure"
      )
    ) +
    labs(
      x                = "Run",
      y                = "K",
      title            = "fastStructure simple for Europe, r2<0.01 (17,028 SNPs)",
      caption          = "algorithm runs for choices of K ranging from 1 to 40"
    ) +
    theme(
      panel.grid.major = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      panel.grid.minor = element_line(
        linetype       = "dashed",
        linewidth      = 0.2
      ),
      legend.text = element_text(
        size = 12
      ),
      legend.title = element_text(
        size           = 14,
        face           = "bold"
      ),
      legend.position = "right"
    )
}

Use our function to plot k values

plot_fastStructure(
  choose_k_simple
) +
  labs(
    subtitle = "simple fastStructure for Europe, r2<0.01 (17,028 SNPs)"
  )

Save plot

ggsave(
  here(
    "output", "europe", "figures", "fastStructure", "fastStructure_simple_k1_k40_europe_r0.01.pdf"
  ),
  width  = 8,
  height = 6,
  units  = "in"
)

1.2.8 Find mean K maximizing likelihood

choose_k_likelihood <- subset(choose_k_simple, model != "structure")
mean(choose_k_likelihood$k) #15.1
## [1] 15.1
median(choose_k_likelihood$k) #15
## [1] 15

2. Run admixture

2.1. Load the libraries

# library(devtools)         # if you need to install some package from github or something else
library(tidyverse)          # many helpful things
library(here)               # to load data easily
library(colorout)           # colors are cool
library(dplyr)              # to manipulate data
library(flextable)          # create tables
# library(ggplot2)          # plots
# library(RColorBrewer)     # color pallets
# library(grid)             # set up plots
# library(scales)           # helps with plots axis scales
# library(showtext)         # helps annotating plots
# library(reticulate)       # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont)          # probably won't work on openOnDemand on the clusters
# library(hrbrthemes)       # not really needed, personal preference for plots
# library(hrbrmisc)         # not really needed, personal preference for plots
library(stringr)            # for strings operations
library(flextable)          # create tables
library(officer)            # export office format

2.2. Export path

# Add this in your script
export PATH=$PATH:/home/mkc54/admixture/releases/admixture_linux-1.3.0/

# Then call admixture from anywhere
admixture -h

Check the Admixture options.

admixture --help
****                   ADMIXTURE Version 1.3.0                  ****
****                    Copyright 2008-2015                     ****
****           David Alexander, Suyash Shringarpure,            ****
****                John  Novembre, Ken Lange                   ****
****                                                            ****
****                 Please cite our paper!                     ****
****   Information at www.genetics.ucla.edu/software/admixture  ****


  ADMIXTURE basic usage:  (see manual for complete reference)
    % admixture [options] inputFile K

  where:
    K is the number of populations; and
    inputFile may be:
      - a PLINK .bed file
      - a PLINK "12" coded .ped file

  Output will be in files inputBasename.K.Q, inputBasename.K.P

  General options:
    -jX          : do computation on X threads
    --seed=X     : use random seed X for initialization

  Algorithm options:
     -m=
    --method=[em|block]     : set method.  block is default

     -a=
    --acceleration=none   |
                   sqs<X> |
                   qn<X>      : set acceleration

  Convergence criteria:
    -C=X : set major convergence criterion (for point estimation)
    -c=x : set minor convergence criterion (for bootstrap and CV reestimates)

  Bootstrap standard errors:
    -B[X]      : do bootstrapping [with X replicates]

2.3 Generate script for Admixture. We will use 2000 bootstraps, and cv=10.


admixture europe.bed $name -j20 --cv=10 -B2000 # 2,000 bootstraps, 10-fold cross-validation, 20 threads

2.4 SNP Set 2: Admixture analyses for LD pruning r<0.1 dataset

2.4.1 Create directories for r<0.1.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture

mkdir r_1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1

for i in $(seq 1 5)
do
  mkdir run$i
done

Generate script for Admixture. We will use 2000 bootstraps, and cv=10.


admixture europe.bed $name -j20 --cv=10 -B2000 # 2,000 bootstraps, 10-fold cross-validation, 20 threads

2.4.2 Generate the full script for dsq for r_1

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
for run in $(seq 1 5); do
    for k in $(seq 1 40); do
        echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/run$run; \
module load ADMIXTURE/1.3.0; \
admixture -s $run --cv=10 -B1000 -j20 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.bed $k | tee log_k$k.txt"
    done
done > dsqr_1.txt  # 200 jobs (5 runs x 40 Ks)

Generate the dsq batch files on the cluster.

module load dSQ/1.05
# make script
dsq \
--job-file dsqr_1.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europer1 \
--batch-file admixture_europer1.sh 
sed -i '6 i #SBATCH --requeue' admixture_europer1.sh;

#check it
head admixture_europer1.sh

2.4.3 Run Admixture for r2<0.1

Before we submit all jobs, it is a good practice to submit only one job of the array and see if it runs without problems.

# create a directory for the logs
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
mkdir logs
#
# submit the first job to see if it works.
sbatch --array=0 admixture_europer1.sh
# Submitted batch job 
# check status
dsqa -j 10387067  # if no errors, we can go ahead and submit all jobs

If it runs without problems, we can submit all jobs.

sbatch admixture_europer1.sh
# Submitted batch job 10763779

# check status
dsqa -j 10763779

2.4.4 Run autopsy for admixture r_1 once it is done

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1

dsqa -j 10763779 -f dsqr_1.txt -s TIMEOUT > rerun_jobs1.txt; wc -l rerun_jobs1.txt
#23 timed out; resubmit the jobs

dsq \
--job-file rerun_jobs1.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europe_rr2 \
--batch-file admixture_europe_rr2.sh 
sed -i '6 i #SBATCH --requeue' admixture_europe_rr2.sh; #job 

sbatch admixture_europe_rr2.sh

# check status
dsqa -j 11805471

dsqa -j 11805471 -f rerun_jobs1.txt -s TIMEOUT > rerun_jobs2.txt; wc -l rerun_jobs2.txt
#7 timedout

dsq \
--job-file rerun_jobs2.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europe_rr2 \
--batch-file admixture_europe_rr2.sh 
sed -i '6 i #SBATCH --requeue' admixture_europe_rr2.sh;

sbatch admixture_europe_rr2.sh

dsqa -j 12274594

dsqa -j 12274594 -f rerun_jobs2.txt -s TIMEOUT > rerun_jobs3.txt; wc -l rerun_jobs3.txt

dsq \
--job-file rerun_jobs3.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europe_rr3 \
--batch-file admixture_europe_rr3.sh 
sed -i '6 i #SBATCH --requeue' admixture_europe_rr3.sh;

sbatch admixture_europe_rr3.sh

dsqa -j 12510727

Check if seed is different for each run

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1

# first get how many jobs we have
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | wc -l
# 200
# then count uniq seeds
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | uniq | wc -l
# 5
# if the two counts do not match, we have duplicated seeds. Repeated seeds within the same run (across different Ks) are fine; what we cannot have is the same seed used in different runs for the same K.
grep -H 'seed' run*/*.txt 
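# (optional, a sketch) to flag only the problematic case, i.e. the same seed reused
# across runs for the same K, one could run something like:
#   grep -H 'Random seed' run*/log_k*.txt | sed -E 's|run([0-9]+)/log_k([0-9]+)\.txt:Random seed: ([0-9]+)|\2 \3|' | sort -n | uniq -d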

2.4.5 Get the cross validation values for r<0.1

Collect the cross-validation information from all the log files. We have to do it for each run, extracting the CV error and the K from each log file. Check the log files:

# navigate to the data directory
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
# let's check the log files using grep. We need to get the line with the CV values. We can grep CV
grep CV run1/log*.txt | head
## run1/log_k1.txt:CV error (K=1): 0.73092
## run1/log_k10.txt:CV error (K=10): 0.67077
## run1/log_k11.txt:CV error (K=11): 0.66934
## run1/log_k12.txt:CV error (K=12): 0.66819
## run1/log_k13.txt:CV error (K=13): 0.66866
## run1/log_k14.txt:CV error (K=14): 0.66722
## run1/log_k15.txt:CV error (K=15): 0.66841
## run1/log_k16.txt:CV error (K=16): 0.66932
## run1/log_k17.txt:CV error (K=17): 0.67033
## run1/log_k18.txt:CV error (K=18): 0.67279

2.4.6 Parse the logs for r<0.1

Make a loop to get the summary from all 5 runs for r<0.1

# on the cluster we can build the loop step by step: instead of running the command,
# use echo $i to check that the loop picks up the right files. Pipe to "| head",
# otherwise it would list all 200 log files (5 runs x 40 Ks).
for i in $(ls -1 run*/log*.txt); do
    echo $i 
done | head -n 2
#run1/log_k10.txt
#run1/log_k11.txt

# now we can replace echo with our loop from the previous chunk, and save it as a file.
# remember, we now have to use grep and not echo
# first let's test the command. Note that the macOS and Linux versions of these tools can differ.
# we add grep's -H flag to print the file path, which we need to recover the run number.
for i in $(ls -1 run*/log*.txt); do
    grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done | head -n 2
# it works, now we can run the loop and write the results in a file.
for i in $(ls -1 run*/log*.txt); do
    grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done > cross_validation_r1.txt
# we should have 200 lines in our file
wc -l cross_validation_r1.txt
# 200 cross_validation_r1.txt

head cross_validation_r1.txt
# it looks okay

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
cp -r cross_validation_r1.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/analyses/europe

Import the cross validation values into R for r_1

# the file is in the analyses directory
##  ............................................................................

cross_val <-
  read_delim(
    here(
      "analyses", "europe", "cross_validation_r1.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "iin"
  ) |>
  rename(
    run = 1,
    k   = 2,
    cv  = 3
  ) |>
  mutate(
    run = as.factor(
      run
    )
  )
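
As a quick sanity check (a sketch, using the cross_val object just created and dplyr's count()), each of the 5 runs should contribute 40 K values:

# each run should appear 40 times (K = 1 to 40)
cross_val |>
  count(run)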

Make a plot of the cross-validation errors for the 5 runs (2,000 bootstraps, 10-fold cross-validation) for the europe r_1 admixture analysis

##  ............................................................................
##  1000 bootstraps and 10 folds cv                                        ####
#
# make plot
cross_val |>
  ggplot() +
  geom_line(
    aes(
      x        = k,
      y        = cv,
      group    = run,
      color    = run,
      linetype = run
    ),
    linewidth  = .75
  ) +
  labs(
    x        = "K",
    y        = "Cross-validation error",
    title    = "Admixture Cross-validation for Europe (47K SNPs)",
    subtitle = "2000 bootstraps and cv = 10 ",
    caption  = "algorithm runs for choices of K ranging from 1 to 40"
  ) +
  scale_color_manual(
    values = c(
      "#C1CDCD", "#1AC722", "#FF8C00", "#87CEFA", "#FF3030"
    )
  ) +
  theme(
    panel.grid.major = element_line(
      linetype       = "dashed",
      linewidth      = 0.2,
    ),
    legend.title     = element_text(
        size           = 14,
        face           = "bold"
      ),
    panel.grid.minor = element_line(
      linetype       = "dashed",
      linewidth      = 0.2
    )
  )

ggsave(
  here(
    "analyses", "europe", "admixture_r1_2000Bootstraps_CV_k1_k40.pdf"
  ),
  width  = 8,
  height = 6,
  units  = "in"
)

Now we can find the K with the lowest cross-validation error for europe admixture r1

##  1000 bootstraps and 10 folds cv                                         #
# to find the lowest cv error for each run
cross_val |>
  group_by(
    run
  ) |>
  summarize(
    LowestCVerror = min(
      cv,
      na.rm       = T
    )
  ) |>
  arrange(
    run
  ) -> cv_min
#
# we can also get the k with the lowest cv error
cross_val |>
  group_by(
    run
  ) |>
  slice(
    which.min(
      cv
    )
  ) |>
  arrange(
    run
  ) -> k_with_lowest_cv
#
# we can left_join the objects and make a table
admix_ks <-
  left_join(
    cv_min,
    k_with_lowest_cv,
    by = "run"
  ) |>
    select(
      -cv
  )
#

# make table
ftbadmix <-
  flextable(
    admix_ks |>
      rename(
        Run               = 1,
        "Lowest CV error" = 2,
        K                 = 3,
      )
  )|>
  set_caption(
    as_paragraph(
      as_chunk(
        "Ks with lowest cross-validation errors for 5 runs of Admixture (47k SNPs)",
        props             = fp_text_default(
          font.family     = "Cambria"
        )
      )
    ),
    word_stylename        = "Table Caption"
  )
#
# check the table
autofit(ftbadmix)
Ks with lowest cross-validation errors for 5 runs of Admixture (47k SNPs)

Run   Lowest CV error   K
1     0.66722           14
2     0.66783           16
3     0.66818           14
4     0.66658           15
5     0.66733           15

#
# save the table
# create settings - check ??save_as_docx on console
sect_properties <-
  prop_section(
  page_size    = page_size(
    orient     = "portrait",
    width      = 8.3, 
    height     = 11.7
  ),
  type         = "continuous",
  page_margins = page_mar()
)
# save
# then you can open MS Word and make adjustments
ftbadmix |>
  autofit() |>
  save_as_docx(
    path       = here(
      "analyses", "europe", "admixture_europe_CV_k1_k40_r1.docx"
    ),
    pr_section = sect_properties
  )

2.5 SNP Set 1: Admixture analyses for LD pruning r<0.01 dataset

2.5.1 Create directories for r<0.01 for europe admixture analyses

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture

mkdir r_01
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01

for i in $(seq 1 5)
do
  mkdir run$i
done

2.5.2 Generate script for Admixture europe r_01

We will use 2000 bootstraps, and cv=10.

admixture europe.bed $name -j20 --cv=10 -B2000 # 2,000 bootstraps, 10-fold cross-validation, 20 threads

Now generate the full script for dsq.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01

for run in $(seq 1 5); do
    for k in $(seq 1 40); do
        echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run$run; \
module load ADMIXTURE/1.3.0; \
admixture -s $run --cv=10 -B1000 -j20 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.bed $k | tee log_k$k.txt"
    done
done > dsqr_01.txt  # 200 jobs (5 runs x 40 Ks)

Generate the dsq batch files on the cluster.

module load dSQ/1.05
# make script
dsq \
--job-file dsqr_01.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europer01 \
--batch-file admixture_europer01.sh 
sed -i '6 i #SBATCH --requeue' admixture_europer01.sh;

#check it
head admixture_europer01.sh

2.5.3 Run Admixture for r2<0.01

Before we submit all jobs, it is a good practice to submit only one job of the array and see if it runs without problems.

# create a directory for the logs
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
mkdir logs
#
# submit the first job to see if it works.
sbatch --array=0 admixture_europer01.sh
# Submitted batch job 
# check status
dsqa -j 10388171  # if no errors, we can go ahead and submit all jobs

If it runs without problems, we can submit all jobs.

sbatch --array=1-39 admixture_europer01.sh

# check status
dsqa -j 10768022

2.5.4 Run autopsy once it is done.

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01

dsqa -j 10389785 -f dsqr_01.txt -s TIMEOUT > rerun_jobs.txt; wc -l rerun_jobs.txt
# all completed!

For the re-runs

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01

dsqa -j 10768022 -f dsqr_01.txt -s TIMEOUT > rerun_jobs.txt; wc -l rerun_jobs.txt
# all completed!

Check if the seed is different for each run

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01

# first get how many jobs we have
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | wc -l
# 200
# then count uniq seeds
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | uniq | wc -l
# 5
# if the two counts do not match, we have duplicated seeds. Repeated seeds within the same run (across different Ks) are fine; what we cannot have is the same seed used in different runs for the same K.
grep -H 'seed' run*/*.txt 
## 200
## 5
## run1/log_k1.txt:Random seed: 1
## run1/log_k10.txt:Random seed: 1
## run1/log_k11.txt:Random seed: 1
## run2/log_k1.txt:Random seed: 2
## run3/log_k1.txt:Random seed: 3
## run4/log_k1.txt:Random seed: 4
## run5/log_k1.txt:Random seed: 5
## ... (200 lines in total: every one of the 40 logs in runN reports "Random seed: N", giving 5 unique seeds)

2.5.5 Get the cross validation values for r<0.01

Collect the cross-validation information from all the log files. We have to do it for each run, extracting the CV error and the K from each log file.

Check log files

# navigate to the data directory
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
# let's check the log files using grep. We need to get the line with the CV values. We can grep CV
grep CV run1/log*.txt | head
#run1/log_k10.txt:CV error (K=10): 0.74723
#run1/log_k11.txt:CV error (K=11): 0.74521
#run1/log_k12.txt:CV error (K=12): 0.74563

2.5.6 Parse the logs for r<0.01

Make loop to get summary from all 5 runs for r<0.01

# on the cluster we can build the loop step by step: instead of running the command,
# use echo $i to check that the loop picks up the right files. Pipe to "| head",
# otherwise it would list all 200 log files (5 runs x 40 Ks).
for i in $(ls -1 run*/log*.txt); do
    echo $i 
done | head -n 2
#run1/log_k10.txt
#run1/log_k11.txt

# now we can replace echo with our loop from the previous chunk, and save it as a file.
# remember, we now have to use grep and not echo
# first let's test the command. Note that the macOS and Linux versions of these tools can differ.
# we add grep's -H flag to print the file path, which we need to recover the run number.
for i in $(ls -1 run*/log*.txt); do
    grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done | head -n 2
#1 10 0.74723
#1 11 0.74521
# it works, now we can run the loop and write the results in a file.
for i in $(ls -1 run*/log*.txt); do
    grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done > cross_validation_r01.txt
# we should have 200 lines in our file
wc -l cross_validation_r01.txt
# 200 cross_validation_r01.txt

head cross_validation_r01.txt
# it looks okay

cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
cp -r cross_validation_r01.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/analyses/europe

Import the cross validation values into R

# the file is in the analyses directory
##  ............................................................................

cross_val <-
  read_delim(
    here(
      "analyses", "europe", "cross_validation_r01.txt"
    ),
    col_names      = FALSE,
    show_col_types = FALSE,
    col_types      = "iin"
  ) |>
  rename(
    run = 1,
    k   = 2,
    cv  = 3
  ) |>
  mutate(
    run = as.factor(
      run
    )
  )

Make a plot of the cross-validation errors for the 5 runs (2,000 bootstraps, 10-fold cross-validation)

##  ............................................................................
##  1000 bootstraps and 10 folds cv                                        ####
#
# make plot
cross_val |>
  ggplot() +
  geom_line(
    aes(
      x        = k,
      y        = cv,
      group    = run,
      color    = run,
      linetype = run
    ),
    linewidth  = .75
  ) +
  labs(
    x        = "K",
    y        = "Cross-validation error",
    title    = "Admixture Cross-validation for Europe (17K SNPs)",
    subtitle = "2000 bootstraps and cv = 10 ",
    caption  = "algorithm runs for choices of K ranging from 1 to 40"
  ) +
  scale_color_manual(
    values = c(
      "#C1CDCD", "#1AC722", "#FF8C00", "#87CEFA", "#FF3030"
    )
  ) +
  theme(
    panel.grid.major = element_line(
      linetype       = "dashed",
      linewidth      = 0.2,
    ),
    legend.title     = element_text(
        size           = 14,
        face           = "bold"
      ),
    panel.grid.minor = element_line(
      linetype       = "dashed",
      linewidth      = 0.2
    )
  )

ggsave(
  here(
    "analyses", "europe", "admixture_r01_2000Bootstraps_CV_k1_k40.pdf"
  ),
  width  = 8,
  height = 6,
  units  = "in"
)

Now we can find the K with the lowest cross-validation error

##  1000 bootstraps and 10 folds cv                                         #
# to find the lowest cv error for each run
cross_val |>
  group_by(
    run
  ) |>
  summarize(
    LowestCVerror = min(
      cv,
      na.rm       = T
    )
  ) |>
  arrange(
    run
  ) -> cv_min
#
# we can also get the k with the lowest cv error
cross_val |>
  group_by(
    run
  ) |>
  slice(
    which.min(
      cv
    )
  ) |>
  arrange(
    run
  ) -> k_with_lowest_cv
#
# we can left_join the objects and make a table
admix_ks <-
  left_join(
    cv_min,
    k_with_lowest_cv,
    by = "run"
  ) |>
    select(
      -cv
  )
#

# make table
ftbadmix <-
  flextable(
    admix_ks |>
      rename(
        Run               = 1,
        "Lowest CV error" = 2,
        K                 = 3,
      )
  )|>
  set_caption(
    as_paragraph(
      as_chunk(
        "Ks with lowest cross-validation errors for 5 runs of Admixture (17k SNPs)",
        props             = fp_text_default(
          font.family     = "Cambria"
        )
      )
    ),
    word_stylename        = "Table Caption"
  )
#
# check the table
autofit(ftbadmix)
Ks with lowest cross-validation errors for 5 runs of Admixture (17k SNPs)

Run   Lowest CV error   K
1     0.74408           16
2     0.74251           14
3     0.74269           14
4     0.74246           13
5     0.74310           12

#
# save the table
# create settings - check ??save_as_docx on console
sect_properties <-
  prop_section(
  page_size    = page_size(
    orient     = "portrait",
    width      = 8.3, 
    height     = 11.7
  ),
  type         = "continuous",
  page_margins = page_mar()
)
# save
# then you can open MS Word and make adjustments
ftbadmix |>
  autofit() |>
  save_as_docx(
    path       = here(
      "analyses", "europe", "admixture_europe_CV_k1_k40_r01.docx"
    ),
    pr_section = sect_properties
  )