# library(devtools) # if you need to install some package from github or something else
library(tidyverse) # many helpful things
library(here) # to load data easily
#library(colorout) # colors are cool
library(dplyr) # to manipulate data
library(ggplot2) # plots
# library(RColorBrewer) # color palettes
# library(grid) # set up plots
# library(scales) # helps with plots axis scales
# library(showtext) # helps annotating plots
# library(reticulate) # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont) # probably won't work on openOnDemand on the clusters
library(hrbrthemes) # not really needed, personal preference for plots
#library(hrbrmisc) # not really needed, personal preference for plots
# library(stringr) # for strings operations
#library(ggstatsplot) # statistics and plotting
library(flextable) # create tables
library(officer) # export office format
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam | sort | uniq -c | awk '{print $2}'
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## IMP
## ITB
## ITP
## ITR
## KER
## KRA
## MAL
## POL
## POP
## RAR
## ROM
## ROS
## SER
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPM
## SPS
## STS
## TIK
## TIR
## TRE
## TUA
## TUH
Count the total number of populations
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam | sort | uniq -c | wc -l
## 41
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1
module load dSQ/1.05
for k in $(seq 1 25); do
for run in $(seq -w 1 100); do
echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1; source /vast/palmer/apps/software/miniconda/23.3.1/etc/profile.d/conda.sh; module load miniconda/4.12.0; conda activate fastStructure; export PYTHONPATH=/home/mkc54/fastStructure/; python -m structure -K $k --input=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b --output=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run$run/simple --full --cv=10 --tol=10e-6"
done
done > dsq_europe_MAF1.txt
head dsq_europe_MAF1.txt #check file
dsq --job-file dsq_europe_MAF1.txt --output /dev/null --mail-type ALL -t 24:00:00 --partition=ycga --cpus-per-task=10 --job-name fS.simple_europe_MAF1 --batch-file simple_europe_MAF1.sh
#Check the files to see if they have 2500 jobs:
head simple_europe_MAF1.sh
Run a random job from our array to see if it works
sbatch --array=$((1 + $RANDOM % 500)) simple_europe_MAF1.sh
# check status
dsqa -j 18205004 #insert job number
#if it looks like it's running OK, you can cancel it
scancel 18205004 #insert job #
Submit all jobs.
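A minimal sketch of the submission (simple_europe_MAF1.sh is the batch file dsq generated above):
sbatch simple_europe_MAF1.sh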
The best K varies from run to run. You can estimate the mean, mode, and median values across runs, then choose one that makes sense.
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1
srun -p ycga --pty -N 1 -n 1 -c 4 bash #start interactive session
module load miniconda
conda activate fastStructure
#First check files
ls run*/* | wc -l # should be 12500
# choose k for manuscript data
export PYTHONPATH=/home/mkc54/fastStructure
python -m chooseK #see how we can use chooseK
#python /vast/palmer/home.mccleary/mkc54/fastStructure/chooseK.py
# --input=<file>
for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run*
do
echo $i | sed 's@.*/@@'; python -m chooseK --input=$i/simple | grep 'likelihood\|structure'
done
#Model complexity that maximizes marginal likelihood =
#Model components used to explain structure in data =
Get outputs in a file
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1
cat <(echo 'run likelihood structure') \
<(for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run*
do
echo $i | sed 's@.*/@@' | sed 's/run/run /' ; python -m chooseK --input=$i/simple | grep 'likelihood' | awk '{print $6, $8}'; python -m chooseK --input=$i/simple | grep 'structure' | awk '{print $6, $10}'
done | xargs -n6 | awk '{print $2, $4, $6}') > simple_europe_MAF_1_25k.txt
head simple_europe_MAF_1_25k.txt #check the file
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1
cp -r simple_europe_MAF_1_25k.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe #copy file
Let's import the data into R.
# library(devtools) # if you need to install some package from github or something else
library(tidyverse) # many helpful things
library(here) # to load data easily
#library(colorout) # colors are cool
library(dplyr) # to manipulate data
library(ggplot2) # plots
# library(RColorBrewer) # color palettes
# library(grid) # set up plots
# library(scales) # helps with plots axis scales
# library(showtext) # helps annotating plots
# library(reticulate) # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont) # probably won't work on openOnDemand on the clusters
library(hrbrthemes) # not really needed, personal preference for plots
#library(hrbrmisc) # not really needed, personal preference for plots
# library(stringr) # for strings operations
library(flextable) # create tables
library(officer) # export office format
# function to import our choosek.py data
import_fastStructure <- function(file) {
# import as a tibble and set columns as integers
dat <- read_delim(
file,
col_names = TRUE,
show_col_types = FALSE,
col_types = "iii"
)
# get columns we need and make it long for plotting
dat <- dat |>
gather(
structure, likelihood, -run
)
# rename the columns by index
dat <- dat |>
rename(
run = 1,
model = 2,
k = 3
)
return(dat)
}
Run the function to import the MAF 1% data into R
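The import call itself is not echoed in this chunk; a minimal sketch, assuming the file copied above sits in the project's output/europe folder:
choose_k_simple <- import_fastStructure(
  here("output", "europe", "simple_europe_MAF_1_25k.txt")
)
# mean k per model; this reproduces the first output block below
aggregate(choose_k_simple[, 3], list(choose_k_simple$model), mean)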
## Group.1 k
## 1 likelihood 18.04
## 2 structure 3.68
## Group.1 k
## 1 likelihood 18
## 2 structure 1
# minimum and maximum k per model (the second output block above appears to be the minimum)
mink <- aggregate(choose_k_simple[, 3], list(choose_k_simple$model), min)
maxk <- aggregate(choose_k_simple[, 3], list(choose_k_simple$model), max)
Mean K across runs: likelihood 18.04; structure 3.68.
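R has no built-in mode function; a small illustrative helper (mode_k is hypothetical, not part of the original script) to get the modal K per model:
mode_k <- function(x) as.integer(names(which.max(table(x))))
aggregate(choose_k_simple[, 3], list(choose_k_simple$model), mode_k)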
Function to plot fastStructure choosek.py results
# function to plot our choosek.py data
plot_fastStructure <- function(df) {
df |>
ggplot() +
geom_line(
aes(
x = run,
y = k,
color = model
),
linewidth = 1
) +
scale_colour_manual(
"model",
values = c(
structure = "#9d60ff",
likelihood = "#ffc08c"
),
labels = c(
"Maximizes \n Likelihood \n", "Explain \n Structure"
)
) +
labs(
x = "Run",
y = "K",
title = "fastStructure simple for Europe (20,968 SNPs)",
caption = "algorithm runs for choices of K ranging from 1 to 25"
) +
theme(
panel.grid.major = element_line(
linetype = "dashed",
linewidth = 0.2
),
panel.grid.minor = element_line(
linetype = "dashed",
linewidth = 0.2
),
legend.text = element_text(
size = 12
),
legend.title = element_text(
size = 14,
face = "bold"
),
legend.position = "right"
)
}
Use our function to plot k values
plot_fastStructure(
choose_k_simple
) +
labs(
subtitle = "simple fastStructure for Europe (20,968 SNPs)"
)
Using ggplot2 to plot individual admixture proportions
Change to the correct run and Q matrix.
fask15 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run072/simple.15.meanQ"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(fask15)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.531 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 3.75e-1 2e-6 2 e-6 2e-6
## 2 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 9.90e-1 2e-6 2 e-6 2e-6
## 3 0.000002 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2 e-6 2e-6 2 e-6 2e-6
## 4 0.399 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 6.01e-1 2e-6 2 e-6 2e-6
## 5 0.578 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 4.22e-1 2e-6 2 e-6 2e-6
## 6 0.232 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 2e-6 5.39e-1 2e-6 5.01e-2 2e-6
## # ℹ 3 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
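The binding code is not echoed; presumably a column bind of the .fam IDs onto the Q matrix, e.g. this sketch (the same pattern applies to the k=18 and k=5 matrices below):
fask15 <- cbind(fam_data, fask15)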
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 0.531236 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.375485 2e-06
## 2 1066 SOC 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.990424 2e-06
## 3 1067 SOC 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06
## 4 1068 SOC 0.398970 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.601009 2e-06
## 5 1069 SOC 0.578240 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.421739 2e-06
## 6 1070 SOC 0.231965 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.538995 2e-06
## X11 X12 X13 X14 X15
## 1 0.000002 2e-06 0.093260 2e-06 2e-06
## 2 0.000002 2e-06 0.009555 2e-06 2e-06
## 3 0.000002 2e-06 0.999977 2e-06 2e-06
## 4 0.000002 2e-06 0.000002 2e-06 2e-06
## 5 0.000002 2e-06 0.000002 2e-06 2e-06
## 6 0.050104 2e-06 0.178918 2e-06 2e-06
Rename the columns
# Rename the columns starting from the third one
fask15 <- fask15 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
fask15$pop[fask15$pop == 'SCH'] <- 'STS' # relabel population SCH as STS
# View the first few rows
head(fask15)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 0.531236 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.375485 2e-06
## 2 1066 SOC 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.990424 2e-06
## 3 1067 SOC 0.000002 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.000002 2e-06
## 4 1068 SOC 0.398970 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.601009 2e-06
## 5 1069 SOC 0.578240 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.421739 2e-06
## 6 1070 SOC 0.231965 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 2e-06 0.538995 2e-06
## v11 v12 v13 v14 v15
## 1 0.000002 2e-06 0.093260 2e-06 2e-06
## 2 0.000002 2e-06 0.009555 2e-06 2e-06
## 3 0.000002 2e-06 0.999977 2e-06 2e-06
## 4 0.000002 2e-06 0.000002 2e-06 2e-06
## 5 0.000002 2e-06 0.000002 2e-06 2e-06
## 6 0.050104 2e-06 0.178918 2e-06 2e-06
Import Sample Locations
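The import chunk for sampling_loc is not echoed; a sketch, assuming the localities are stored as a CSV (the path and file name are hypothetical). The same object is reused in the k=18 and k=5 sections below:
sampling_loc <- read_csv(
  here("data", "sampling_locations.csv"), # hypothetical path
  show_col_types = FALSE
)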
## Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1 Berlin, NJ USA 39.79081 -74.9291 Americas BER 2018
## 2 Columbus, OH USA 39.97170 -82.9071 Americas COL 2015
## 3 Palm Beach USA 26.70560 -80.0364 Americas PAL 2018
## 4 Houston, TX USA 29.75491 -95.3505 Americas HOU 2018
## 5 Los Angeles USA 34.05220 -118.2437 Americas LOS 2018
## 6 Manaus, AM Brazil -3.09161 -60.0325 Americas MAU 2017
## Region Subregion order order2 orderold
## 1 North America 1 NA 75
## 2 North America 2 NA 76
## 3 North America 3 NA 77
## 4 North America 4 NA 78
## 5 North America 5 NA 79
## 6 South America 6 NA 80
color_palette_15 <-
c("blue",
"#FF8C1A",
"purple4",
"#77DD77",
"orangered",
"magenta",
"#75FAFF",
"#F49AC2",
"#B22222",
"#B20CC9",
"green",
"#1E90FF",
"yellow2",
"green4",
"purple")
source(
here("my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- fask15 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_15[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n fastStructure inference with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_identity() + # the fill variable already holds the mapped colors, so use them directly
expand_limits(y = c(0, 1.5))
Change to the correct run and Q matrix.
fask18 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run093/simple.18.meanQ"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(fask18)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000001 1e-6 8.21e-1 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1 e-6
## 2 0.000001 1e-6 1 e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.00e+0
## 3 0.000001 1e-6 1 e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1 e-6
## 4 0.0182 1e-6 5.38e-1 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 4.44e-1
## 5 0.000001 1e-6 6.77e-1 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2.93e-1
## 6 0.0559 1e-6 4.29e-1 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3.78e-1
## # ℹ 6 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## # X18 <dbl>
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 1065 SOC 0.000001 1e-06 0.820516 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1067 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1068 SOC 0.018240 1e-06 0.537642 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1069 SOC 0.000001 1e-06 0.677058 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1070 SOC 0.055852 1e-06 0.429497 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## X11 X12 X13 X14 X15 X16 X17 X18
## 1 1e-06 0.000001 0.000001 0.000001 0.179463 1e-06 1e-06 1e-06
## 2 1e-06 0.999977 0.000001 0.000001 0.000001 1e-06 1e-06 1e-06
## 3 1e-06 0.000001 0.000001 0.000001 0.999977 1e-06 1e-06 1e-06
## 4 1e-06 0.443530 0.000001 0.000569 0.000001 1e-06 1e-06 1e-06
## 5 1e-06 0.293325 0.029597 0.000001 0.000001 1e-06 1e-06 1e-06
## 6 1e-06 0.377923 0.004133 0.000001 0.132579 1e-06 1e-06 1e-06
Rename the columns
# Rename the columns starting from the third one
fask18 <- fask18 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
fask18$pop[fask18$pop == 'SCH'] <- 'STS' # relabel population SCH as STS
# View the first few rows
head(fask18)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10
## 1 1065 SOC 0.000001 1e-06 0.820516 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 2 1066 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 3 1067 SOC 0.000001 1e-06 0.000001 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 4 1068 SOC 0.018240 1e-06 0.537642 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 5 1069 SOC 0.000001 1e-06 0.677058 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## 6 1070 SOC 0.055852 1e-06 0.429497 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06 1e-06
## v11 v12 v13 v14 v15 v16 v17 v18
## 1 1e-06 0.000001 0.000001 0.000001 0.179463 1e-06 1e-06 1e-06
## 2 1e-06 0.999977 0.000001 0.000001 0.000001 1e-06 1e-06 1e-06
## 3 1e-06 0.000001 0.000001 0.000001 0.999977 1e-06 1e-06 1e-06
## 4 1e-06 0.443530 0.000001 0.000569 0.000001 1e-06 1e-06 1e-06
## 5 1e-06 0.293325 0.029597 0.000001 0.000001 1e-06 1e-06 1e-06
## 6 1e-06 0.377923 0.004133 0.000001 0.132579 1e-06 1e-06 1e-06
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1 Berlin, NJ USA 39.79081 -74.9291 Americas BER 2018
## 2 Columbus, OH USA 39.97170 -82.9071 Americas COL 2015
## 3 Palm Beach USA 26.70560 -80.0364 Americas PAL 2018
## 4 Houston, TX USA 29.75491 -95.3505 Americas HOU 2018
## 5 Los Angeles USA 34.05220 -118.2437 Americas LOS 2018
## 6 Manaus, AM Brazil -3.09161 -60.0325 Americas MAU 2017
## Region Subregion order order2 orderold
## 1 North America 1 NA 75
## 2 North America 2 NA 76
## 3 North America 3 NA 77
## 4 North America 4 NA 78
## 5 North America 5 NA 79
## 6 South America 6 NA 80
color_palette_18 <-
c(
"#77DD77",
"#AE8333",
"#B22222",
"purple",
"green4",
"#75FAFF",
"orangered",
"#B20CC9",
"#1E90FF",
"blue",
"#FFFF99",
"purple4",
"navy",
"#FF8C1A",
"magenta",
"#F49AC2",
"green",
"yellow2"
)
source(
here("my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- fask18 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
# Generate all potential variable names
all_variables <- paste0("v", 1:18)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_18[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n fastStructure inference with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_identity() + # the fill variable already holds the mapped colors, so use them directly
expand_limits(y = c(0, 1.5))
Change to the correct run and Q matrix.
fask5 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/MAF_1/run072/simple.5.meanQ"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(fask5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0533 0.000005 0.000005 0.945 0.00211
## 2 0.000005 0.000005 0.000005 1.00 0.000005
## 3 0.000005 0.000005 0.000005 1.00 0.000005
## 4 0.000005 0.000005 0.000005 0.943 0.0572
## 5 0.0400 0.000005 0.0794 0.881 0.000005
## 6 0.0991 0.000005 0.000005 0.901 0.000005
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 0.053349 5e-06 0.000005 0.944530 0.002111
## 2 1066 SOC 0.000005 5e-06 0.000005 0.999980 0.000005
## 3 1067 SOC 0.000005 5e-06 0.000005 0.999980 0.000005
## 4 1068 SOC 0.000005 5e-06 0.000005 0.942767 0.057218
## 5 1069 SOC 0.040044 5e-06 0.079437 0.880509 0.000005
## 6 1070 SOC 0.099121 5e-06 0.000005 0.900864 0.000005
Rename the columns
# Rename the columns starting from the third one
fask5 <- fask5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
fask5$pop[fask5$pop == 'SCH'] <- 'STS' # relabel population SCH as STS
# View the first few rows
head(fask5)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 0.053349 5e-06 0.000005 0.944530 0.002111
## 2 1066 SOC 0.000005 5e-06 0.000005 0.999980 0.000005
## 3 1067 SOC 0.000005 5e-06 0.000005 0.999980 0.000005
## 4 1068 SOC 0.000005 5e-06 0.000005 0.942767 0.057218
## 5 1069 SOC 0.040044 5e-06 0.079437 0.880509 0.000005
## 6 1070 SOC 0.099121 5e-06 0.000005 0.900864 0.000005
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1 Berlin, NJ USA 39.79081 -74.9291 Americas BER 2018
## 2 Columbus, OH USA 39.97170 -82.9071 Americas COL 2015
## 3 Palm Beach USA 26.70560 -80.0364 Americas PAL 2018
## 4 Houston, TX USA 29.75491 -95.3505 Americas HOU 2018
## 5 Los Angeles USA 34.05220 -118.2437 Americas LOS 2018
## 6 Manaus, AM Brazil -3.09161 -60.0325 Americas MAU 2017
## Region Subregion order order2 orderold
## 1 North America 1 NA 75
## 2 North America 2 NA 76
## 3 North America 3 NA 77
## 4 North America 4 NA 78
## 5 North America 5 NA 79
## 6 South America 6 NA 80
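The chunk below references color_palette_5, which is never defined in this document; a minimal definition so the chunk runs (these five colors are placeholders, an assumption, not the originals):
color_palette_5 <-
  c("blue",
    "#FF8C1A",
    "purple4",
    "#77DD77",
    "orangered")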
source(
here("my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- fask5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_5[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n fastStructure inference with 20,968 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_identity() + # the fill variable already holds the mapped colors, so use them directly
expand_limits(y = c(0, 1.5))
# library(devtools) # if you need to install some package from github or something else
library(tidyverse) # many helpful things
library(here) # to load data easily
library(colorout) # colors are cool
library(dplyr) # to manipulate data
library(flextable) # create tables
#library(ggstatsplot) # statistics and plotting
# library(ggplot2) # plots
# library(RColorBrewer) # color palettes
# library(grid) # set up plots
# library(scales) # helps with plots axis scales
# library(showtext) # helps annotating plots
# library(reticulate) # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont) # probably won't work on openOnDemand on the clusters
# library(hrbrthemes) # not really needed, personal preference for plots
# library(hrbrmisc) # not really needed, personal preference for plots
library(stringr) # for strings operations
library(officer) # export office format
admixture --cv=10 -B2000 -j20 europe.bed $k # 2000 bootstraps, 10-fold cross-validation, 20 threads
Now generate the full script for dsq for MAF 1%
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1
for run in $(seq 1 5); do
for k in $(seq 1 25); do
echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/run$run; \
module load ADMIXTURE/1.3.0; \
admixture -s $run --cv=10 -B2000 -j20 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.bed $k | tee log_k$k.txt"
done
done > dsq_MAF_1.txt # jobs
Generate the dsq batch files on the cluster for MAF 1%
module load dSQ/1.05
# make script
dsq \
--job-file dsq_MAF_1.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_MAF_1 \
--batch-file admixture_MAF_1.sh
sed -i '6 i #SBATCH --requeue' admixture_MAF_1.sh;
#check it
head admixture_MAF_1.sh
Before we submit all jobs, it is a good practice to submit only one job of the array and see if it runs without problems.
# create a directory for the logs
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1
mkdir logs
#
# submit the first job to see if it works.
sbatch --array=0 admixture_MAF_1.sh
# Submitted batch job 18205974
# check status
dsqa -j 18205974 # if no errors, we can go ahead and submit all jobs
If it runs without problems, we can submit all jobs.
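A minimal sketch of the full submission:
sbatch admixture_MAF_1.sh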
Check if the seed is different for each run
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1
# first get how many jobs we have
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | wc -l
# 125
# then count uniq seeds
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | uniq | wc -l
# 5
# if the counts do not match, we have duplicated seeds. Repeated seeds are fine within the same run across different values of k; what we cannot have is the same seed in different runs for the same k.
grep -H 'seed' run*/*.txt
## 125
## 5
## run1/log_k1.txt:Random seed: 1
## run1/log_k10.txt:Random seed: 1
## run1/log_k11.txt:Random seed: 1
## run1/log_k12.txt:Random seed: 1
## run1/log_k13.txt:Random seed: 1
## run1/log_k14.txt:Random seed: 1
## run1/log_k15.txt:Random seed: 1
## run1/log_k16.txt:Random seed: 1
## run1/log_k17.txt:Random seed: 1
## run1/log_k18.txt:Random seed: 1
## run1/log_k19.txt:Random seed: 1
## run1/log_k2.txt:Random seed: 1
## run1/log_k20.txt:Random seed: 1
## run1/log_k21.txt:Random seed: 1
## run1/log_k22.txt:Random seed: 1
## run1/log_k23.txt:Random seed: 1
## run1/log_k24.txt:Random seed: 1
## run1/log_k25.txt:Random seed: 1
## run1/log_k3.txt:Random seed: 1
## run1/log_k4.txt:Random seed: 1
## run1/log_k5.txt:Random seed: 1
## run1/log_k6.txt:Random seed: 1
## run1/log_k7.txt:Random seed: 1
## run1/log_k8.txt:Random seed: 1
## run1/log_k9.txt:Random seed: 1
## run2/log_k1.txt:Random seed: 2
## run2/log_k10.txt:Random seed: 2
## run2/log_k11.txt:Random seed: 2
## run2/log_k12.txt:Random seed: 2
## run2/log_k13.txt:Random seed: 2
## run2/log_k14.txt:Random seed: 2
## run2/log_k15.txt:Random seed: 2
## run2/log_k16.txt:Random seed: 2
## run2/log_k17.txt:Random seed: 2
## run2/log_k18.txt:Random seed: 2
## run2/log_k19.txt:Random seed: 2
## run2/log_k2.txt:Random seed: 2
## run2/log_k20.txt:Random seed: 2
## run2/log_k21.txt:Random seed: 2
## run2/log_k22.txt:Random seed: 2
## run2/log_k23.txt:Random seed: 2
## run2/log_k24.txt:Random seed: 2
## run2/log_k25.txt:Random seed: 2
## run2/log_k3.txt:Random seed: 2
## run2/log_k4.txt:Random seed: 2
## run2/log_k5.txt:Random seed: 2
## run2/log_k6.txt:Random seed: 2
## run2/log_k7.txt:Random seed: 2
## run2/log_k8.txt:Random seed: 2
## run2/log_k9.txt:Random seed: 2
## run3/log_k1.txt:Random seed: 3
## run3/log_k10.txt:Random seed: 3
## run3/log_k11.txt:Random seed: 3
## run3/log_k12.txt:Random seed: 3
## run3/log_k13.txt:Random seed: 3
## run3/log_k14.txt:Random seed: 3
## run3/log_k15.txt:Random seed: 3
## run3/log_k16.txt:Random seed: 3
## run3/log_k17.txt:Random seed: 3
## run3/log_k18.txt:Random seed: 3
## run3/log_k19.txt:Random seed: 3
## run3/log_k2.txt:Random seed: 3
## run3/log_k20.txt:Random seed: 3
## run3/log_k21.txt:Random seed: 3
## run3/log_k22.txt:Random seed: 3
## run3/log_k23.txt:Random seed: 3
## run3/log_k24.txt:Random seed: 3
## run3/log_k25.txt:Random seed: 3
## run3/log_k3.txt:Random seed: 3
## run3/log_k4.txt:Random seed: 3
## run3/log_k5.txt:Random seed: 3
## run3/log_k6.txt:Random seed: 3
## run3/log_k7.txt:Random seed: 3
## run3/log_k8.txt:Random seed: 3
## run3/log_k9.txt:Random seed: 3
## run4/log_k1.txt:Random seed: 4
## run4/log_k10.txt:Random seed: 4
## run4/log_k11.txt:Random seed: 4
## run4/log_k12.txt:Random seed: 4
## run4/log_k13.txt:Random seed: 4
## run4/log_k14.txt:Random seed: 4
## run4/log_k15.txt:Random seed: 4
## run4/log_k16.txt:Random seed: 4
## run4/log_k17.txt:Random seed: 4
## run4/log_k18.txt:Random seed: 4
## run4/log_k19.txt:Random seed: 4
## run4/log_k2.txt:Random seed: 4
## run4/log_k20.txt:Random seed: 4
## run4/log_k21.txt:Random seed: 4
## run4/log_k22.txt:Random seed: 4
## run4/log_k23.txt:Random seed: 4
## run4/log_k24.txt:Random seed: 4
## run4/log_k25.txt:Random seed: 4
## run4/log_k3.txt:Random seed: 4
## run4/log_k4.txt:Random seed: 4
## run4/log_k5.txt:Random seed: 4
## run4/log_k6.txt:Random seed: 4
## run4/log_k7.txt:Random seed: 4
## run4/log_k8.txt:Random seed: 4
## run4/log_k9.txt:Random seed: 4
## run5/log_k1.txt:Random seed: 5
## run5/log_k10.txt:Random seed: 5
## run5/log_k11.txt:Random seed: 5
## run5/log_k12.txt:Random seed: 5
## run5/log_k13.txt:Random seed: 5
## run5/log_k14.txt:Random seed: 5
## run5/log_k15.txt:Random seed: 5
## run5/log_k16.txt:Random seed: 5
## run5/log_k17.txt:Random seed: 5
## run5/log_k18.txt:Random seed: 5
## run5/log_k19.txt:Random seed: 5
## run5/log_k2.txt:Random seed: 5
## run5/log_k20.txt:Random seed: 5
## run5/log_k21.txt:Random seed: 5
## run5/log_k22.txt:Random seed: 5
## run5/log_k23.txt:Random seed: 5
## run5/log_k24.txt:Random seed: 5
## run5/log_k25.txt:Random seed: 5
## run5/log_k3.txt:Random seed: 5
## run5/log_k4.txt:Random seed: 5
## run5/log_k5.txt:Random seed: 5
## run5/log_k6.txt:Random seed: 5
## run5/log_k7.txt:Random seed: 5
## run5/log_k8.txt:Random seed: 5
## run5/log_k9.txt:Random seed: 5
Collect the cross-validation information from all the log files. We have to do it for each run: we need to get the CV error and the K from each log file.
# navigate to the data directory
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1
# let's check the log files using grep. We need to get the line with the CV values. We can grep CV
grep CV run1/log*.txt | head
#run1/log_k10.txt:CV error (K=10): 0.70301
#run1/log_k11.txt:CV error (K=11): 0.70093
#run1/log_k12.txt:CV error (K=12): 0.69920
## run1/log_k1.txt:CV error (K=1): 0.76386
## run1/log_k10.txt:CV error (K=10): 0.70301
## run1/log_k11.txt:CV error (K=11): 0.70093
## run1/log_k12.txt:CV error (K=12): 0.69920
## run1/log_k13.txt:CV error (K=13): 0.69982
## run1/log_k14.txt:CV error (K=14): 0.69909
## run1/log_k15.txt:CV error (K=15): 0.69856
## run1/log_k16.txt:CV error (K=16): 0.70058
## run1/log_k17.txt:CV error (K=17): 0.70064
## run1/log_k18.txt:CV error (K=18): 0.70122
Make a loop to get the summary from all 5 runs for MAF 1%
# on the cluster, we can build the loop step by step. For example, to check that the loop picks up the files we need, we can use echo $i to list them instead of running the real command. We need "| head", otherwise it will list all 125 files.
for i in $(ls -1 run*/log*.txt); do
echo $i
done | head -n 2
#run1/log_k10.txt
#run1/log_k11.txt
# now we can replace echo with our loop from the previous chunk, and save it as a file.
# remember, we now have to use grep and not echo
# first let's test the command. Sometimes the macOS and Linux versions of these tools behave differently. We add the -H flag so grep prints the file path; we need it to get the run number.
for i in $(ls -1 run*/log*.txt); do
grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done | head -n 2
#1 10 0.70301
#1 11 0.70093
# it works, now we can run the loop and write the results in a file.
for i in $(ls -1 run*/log*.txt); do
grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done > cross_validation_MAF_1.txt
# we should have 125 lines in our file
wc -l cross_validation_MAF_1.txt
# 125
head cross_validation_MAF_1.txt
# it looks okay
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/MAF_1
cp -r cross_validation_MAF_1.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/analyses/europe
Import the cross-validation values into R
cross_val <-
read_delim(
here(
"analyses", "europe", "cross_validation_MAF_1.txt"
),
col_names = FALSE,
show_col_types = FALSE,
col_types = "iin"
) |>
rename(
run = 1,
k = 2,
cv = 3
) |>
mutate(
run = as.factor(
run
)
)
Make a plot of the cross-validation errors for the 5 runs, using 2000 bootstraps and 10-fold cross-validation
## ............................................................................
## 2000 bootstraps and 10-fold cv ####
#
# make plot
cross_val |>
ggplot() +
geom_line(
aes(
x = k,
y = cv,
group = run,
color = run,
linetype = run
),
linewidth = .75
) +
labs(
x = "K",
y = "Cross-validation error",
title = "Admixture Cross-validation for Europe (20,968 SNPs)",
subtitle = "2000 bootstraps and cv = 10 ",
caption = "algorithm runs for choices of K ranging from 1 to 25"
) +
scale_color_manual(
values = c(
"#C1CDCD", "#1AC722", "#FF8C00", "#87CEFA", "#FF3030"
)
) +
theme(
panel.grid.major = element_line(
linetype = "dashed",
linewidth = 0.2,
),
legend.title = element_text(
size = 14,
face = "bold"
),
panel.grid.minor = element_line(
linetype = "dashed",
linewidth = 0.2
)
)
Save the plot
ggsave(
here(
"analyses", "europe", "admixture_MAF_1_2000Bootstraps_CV_k1_k25.pdf"
),
width = 8,
height = 6,
units = "in"
)
Now we can find the K with the lowest cross-validation error
## 2000 bootstraps and 10-fold cv #
# to find the lowest cv error for each run
cross_val |>
group_by(
run
) |>
summarize(
LowestCVerror = min(
cv,
na.rm = T
)
) |>
arrange(
run
) -> cv_min
#
# we can also get the k with the lowest cv error
cross_val |>
group_by(
run
) |>
slice(
which.min(
cv
)
) |>
arrange(
run
) -> k_with_lowest_cv
#
# we can left_join the objects and make a table
admix_ks <-
left_join(
cv_min,
k_with_lowest_cv,
by = "run"
) |>
select(
-cv
)
#
# make table
ftbadmix <-
flextable(
admix_ks |>
rename(
Run = 1,
"Lowest CV error" = 2,
K = 3
)
)|>
set_caption(
as_paragraph(
as_chunk(
"Ks with lowest cross-validation errors for 5 runs of Admixture (20,968 SNPs)",
props = fp_text_default(
font.family = "Cambria"
)
)
),
word_stylename = "Table Caption"
)
#
# check the table
autofit(ftbadmix)
| Run | Lowest CV error | K  |
|-----|-----------------|----|
| 1   | 0.69856         | 15 |
| 2   | 0.69812         | 16 |
| 3   | 0.69796         | 13 |
| 4   | 0.69820         | 15 |
| 5   | 0.69787         | 17 |
#
# save the table
# create settings - check ??save_as_docx on console
sect_properties <-
prop_section(
page_size = page_size(
orient = "portrait",
width = 8.3,
height = 11.7
),
type = "continuous",
page_margins = page_mar()
)
# save
# then you can open MS Word and make adjustments
ftbadmix |>
autofit() |>
save_as_docx(
path = here(
"analyses", "europe", "admixture_europe_CV_k1_k25_MAF_1.docx"
),
pr_section = sect_properties
)
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam | sort | uniq -c | awk '{print $2}'
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## IMP
## ITB
## ITP
## ITR
## KER
## KRA
## MAL
## POL
## POP
## RAR
## ROM
## ROS
## SER
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPM
## SPS
## STS
## TIK
## TIR
## TRE
## TUA
## TUH
Count the total number of populations
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam | sort | uniq -c | wc -l
## 41
#create dsq file for simple europe r_1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
module load dSQ/1.05
for k in $(seq 1 40); do
for run in $(seq -w 1 100); do
echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1; source /vast/palmer/apps/software/miniconda/23.3.1/etc/profile.d/conda.sh; module load miniconda/4.12.0; conda activate fastStructure; export PYTHONPATH=/home/mkc54/fastStructure/; python -m structure -K $k --input=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1 --output=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run$run/simple --full --cv=10 --tol=10e-6"
done
done > dsq_europe_r_1.txt
head dsq_europe_r_1.txt #check file
dsq --job-file dsq_europe_r_1.txt --output /dev/null --mail-type ALL --job-name fS.simple_europe_r_1 --batch-file simple_europe_r_1.sh
#Check the files to see if they have 4,000 jobs:
head simple_europe_r_1.sh
Run a random job from our array to see if it works
#test job for simple for simple europe r_1
sbatch --array=$((1 + $RANDOM % 3000)) simple_europe_r_1.sh
# check status
dsqa -j 9908113 #insert job number
#if it looks like it's running OK, you can cancel it
scancel 9908113 #insert job #
Submit all jobs.
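A minimal sketch, as with the MAF 1% array:
sbatch simple_europe_r_1.sh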
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
module load dSQ/1.05
dsqa -j 9928902 -f dsq_europe_r_1.txt -s TIMEOUT > re-run_jobs1.txt 2> 9928902_report1.txt; head 9928902_report1.txt
head re-run_jobs1.txt # 227 timed out
#create new batch script for any timeout jobs
dsq --job-file re-run_jobs1.txt --mem-per-cpu 5g --cpus-per-task=1 --mail-type END -t 12:00:00 --job-name fs.simple_europe_rerun1 --output /dev/null --batch-file simple_europe_rerun1.sh
head simple_europe_rerun1.sh
#Re-run timeout jobs
sbatch simple_europe_rerun1.sh #10235691
dsqa -j 10235691
#Run autopsy of reruns
dsqa -j 10235691 -f re-run_jobs1.txt -s TIMEOUT > re-run_jobs3.txt 2> 10235691_report.txt
#all completed successfully
The best K varies from run to run. You can estimate the mean, mode, and median values, then choose one that makes sense.
#choose k for simple fastStructure run for simple europe r2=0.1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
srun -p ycga --pty -N 1 -n 1 -c 4 bash #start interactive session
module load miniconda
conda activate fastStructure
#First check files
ls run*/* | wc -l # should be 20000
# choose k for manuscript data
export PYTHONPATH=/home/mkc54/fastStructure
python -m chooseK #see how we can use chooseK
#python /vast/palmer/home.mccleary/mkc54/fastStructure/chooseK.py
# --input=<file>
for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run*
do
echo $i | sed 's@.*/@@'; python -m chooseK --input=$i/simple | grep 'likelihood\|structure'
done
Get outputs in a file
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
cat <(echo 'run likelihood structure') \
<(for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1/run*
do
echo $i | sed 's@.*/@@' | sed 's/run/run /' ; python -m chooseK --input=$i/simple | grep 'likelihood' | awk '{print $6, $8}'; python -m chooseK --input=$i/simple | grep 'structure' | awk '{print $6, $10}'
done | xargs -n6 | awk '{print $2, $4, $6}') > simple_europe_r0.1_40k.txt
head simple_europe_r0.1_40k.txt #check the file
#001 12 27
#002 20 27
#003 19 27
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_1
cp -r simple_europe_r0.1_40k.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe #copy file
Let's import the data into R.
# library(devtools) # if you need to install some package from github or something else
library(tidyverse) # many helpful things
library(here) # to load data easily
#library(colorout) # colors are cool
library(dplyr) # to manipulate data
library(ggplot2) # plots
# library(RColorBrewer) # color palettes
# library(grid) # set up plots
# library(scales) # helps with plots axis scales
# library(showtext) # helps annotating plots
# library(reticulate) # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont) # probably won't work on openOnDemand on the clusters
library(hrbrthemes) # not really needed, personal preference for plots
#library(hrbrmisc) # not really needed, personal preference for plots
# library(stringr) # for strings operations
#library(ggstatsplot) # statistics and plotting
library(flextable) # create tables
library(officer) # export office format
The import_fastStructure function, as defined above:
# function to import our choosek.py data
import_fastStructure <- function(file) {
# import as a tibble and set columns as integers
dat <- read_delim(
file,
col_names = TRUE,
show_col_types = FALSE,
col_types = "iii"
)
# get columns we need and make it long for plotting
dat <- dat |>
gather(
structure, likelihood, -run
)
# rename the columns by index
dat <- dat |>
rename(
run = 1,
model = 2,
k = 3
)
return(dat)
}
Run the import function for the simple-prior model for Europe r2<0.1
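The call is not echoed; a sketch mirroring the MAF 1% import, assuming the copied file's location:
choose_k_simple <- import_fastStructure(
  here("output", "europe", "simple_europe_r0.1_40k.txt")
)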
Function to plot fastStructure choosek.py results for simple europe r2<0.1
# function to plot our choosek.py data
plot_fastStructure <- function(df) {
df |>
ggplot() +
geom_line(
aes(
x = run,
y = k,
color = model
),
linewidth = 1
) +
scale_colour_manual(
"model",
values = c(
structure = "#9d60ff",
likelihood = "#ffc08c"
),
labels = c(
"Maximizes \n Likelihood \n", "Explain \n Structure"
)
) +
labs(
x = "Run",
y = "K",
title = "fastStructure simple for Europe, r2<0.1 (47,484 SNPs)",
caption = "algorithm runs for choices of K ranging from 1 to 40"
) +
theme(
panel.grid.major = element_line(
linetype = "dashed",
linewidth = 0.2
),
panel.grid.minor = element_line(
linetype = "dashed",
linewidth = 0.2
),
legend.text = element_text(
size = 12
),
legend.title = element_text(
size = 14,
face = "bold"
),
legend.position = "right"
)
}
Use our function to plot k values
plot_fastStructure(
choose_k_simple
) +
labs(
subtitle = "simple fastStructure for Europe, r2<0.1 (47,484 SNPs)"
)
Save the plot
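The save chunk is not echoed; a sketch mirroring the ggsave call used for the MAF 1% cross-validation plot above (the file name is an assumption):
ggsave(
  here(
    "analyses", "europe", "fastStructure_simple_europe_r0.1_k1_k40.pdf" # hypothetical file name
  ),
  width = 8,
  height = 6,
  units = "in"
)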
#get_summary_samples for r_0.01
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam | sort | uniq -c | awk '{print $2}'
## ALD
## ALU
## ALV
## ARM
## BAR
## BRE
## BUL
## CES
## CRO
## DES
## FRS
## GES
## GRA
## GRC
## IMP
## ITB
## ITP
## ITR
## KER
## KRA
## MAL
## POL
## POP
## RAR
## ROM
## ROS
## SER
## SEV
## SIC
## SLO
## SOC
## SPB
## SPC
## SPM
## SPS
## STS
## TIK
## TIR
## TRE
## TUA
## TUH
Count the total number of sampling localities for Europe r_01
awk '{print $1}' /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam | sort | uniq -c | wc -l
## 41
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01
module load dSQ/1.05
for k in $(seq 1 40); do
for run in $(seq -w 1 100); do
echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01; source /vast/palmer/apps/software/miniconda/23.3.1/etc/profile.d/conda.sh; module load miniconda/4.12.0; conda activate fastStructure; export PYTHONPATH=/home/mkc54/fastStructure/; python -m structure -K $k --input=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01 --output=/gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run$run/simple --full --cv=10 --tol=10e-6"
done
done > dsq_europe_r_01.txt
head dsq_europe_r_01.txt #check file
dsq --job-file dsq_europe_r_01.txt --output /dev/null --mail-type ALL --partition=ycga -t 24:00:00 --job-name fS.simple_europe_r_01 --batch-file simple_europe_r_01.sh
#Check the files to see if they have 4,000 jobs:
head simple_europe_r_01.sh
Run a random job from our array to test if it works
sbatch --array=$((1 + $RANDOM % 1000)) simple_europe_r_01.sh
# check status
dsqa -j 9918767 #insert job number
#it worked
Submit all jobs.
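A minimal sketch:
sbatch simple_europe_r_01.sh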
The best K varies from run to run. You can estimate the mean, mode, and median values across runs, then choose one that makes sense.
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01
srun -p ycga --pty -N 1 -n 1 -c 4 bash #start interactive session
module load miniconda
conda activate fastStructure
#First check files
ls run*/* | wc -l # should be 20000
# choose k for manuscript data
export PYTHONPATH=/home/mkc54/fastStructure
python -m chooseK #see how we can use chooseK
#python /vast/palmer/home.mccleary/mkc54/fastStructure/chooseK.py
# --input=<file>
for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run*
do
echo $i | sed 's@.*/@@'; python -m chooseK --input=$i/simple | grep 'likelihood\|structure'
done
Get outputs in a file
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01
cat <(echo 'run likelihood structure') \
<(for i in /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01/run*
do
echo $i | sed 's@.*/@@' | sed 's/run/run /' ; python -m chooseK --input=$i/simple | grep 'likelihood' | awk '{print $6, $8}'; python -m chooseK --input=$i/simple | grep 'structure' | awk '{print $6, $10}'
done | xargs -n6 | awk '{print $2, $4, $6}') > simple_europe_r0.01_40k.txt
head simple_europe_r0.01_40k.txt #check the file
#001 12 26
#002 38 27
#003 19 27
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/fastStructure/r_01
cp -r simple_europe_r0.01_40k.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe #copy file
Run the import function for the simple-prior model for Europe r2<0.01
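As above, the call is not echoed; a sketch assuming the copied file's location:
choose_k_simple <- import_fastStructure(
  here("output", "europe", "simple_europe_r0.01_40k.txt")
)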
Function to plot fastStructure choosek.py results
# function to plot our choosek.py data
plot_fastStructure <- function(df) {
df |>
ggplot() +
geom_line(
aes(
x = run,
y = k,
color = model
),
linewidth = 1
) +
scale_colour_manual(
"model",
values = c(
structure = "#9d60ff",
likelihood = "#ffc08c"
),
labels = c(
"Maximizes \n Likelihood \n", "Explain \n Structure"
)
) +
labs(
x = "Run",
y = "K",
title = "fastStructure simple for Europe, r2<0.01 (17,028 SNPs)",
caption = "algorithm runs for choices of K ranging from 1 to 40"
) +
theme(
panel.grid.major = element_line(
linetype = "dashed",
linewidth = 0.2
),
panel.grid.minor = element_line(
linetype = "dashed",
linewidth = 0.2
),
legend.text = element_text(
size = 12
),
legend.title = element_text(
size = 14,
face = "bold"
),
legend.position = "right"
)
}
Use our function to plot k values
plot_fastStructure(
choose_k_simple
) +
labs(
subtitle = "simple fastStructure for Europe, r2<0.01 (17,028 SNPs)"
)
Save the plot
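Again not echoed; a sketch with an assumed file name:
ggsave(
  here(
    "analyses", "europe", "fastStructure_simple_europe_r0.01_k1_k40.pdf" # hypothetical file name
  ),
  width = 8,
  height = 6,
  units = "in"
)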
# library(devtools) # if you need to install some package from github or something else
library(tidyverse) # many helpful things
library(here) # to load data easily
library(colorout) # colors are cool
library(dplyr) # to manipulate data
library(flextable) # create tables
# library(ggplot2) # plots
# library(RColorBrewer) # color palettes
# library(grid) # set up plots
# library(scales) # helps with plots axis scales
# library(showtext) # helps annotating plots
# library(reticulate) # to learn more about reticulate https://rstudio.github.io/reticulate/articles/r_markdown.html#engine-setup
library(extrafont) # probably won't work on openOnDemand on the clusters
# library(hrbrthemes) # not really needed, personal preference for plots
# library(hrbrmisc) # not really needed, personal preference for plots
library(stringr) # for strings operations
library(officer) # export office format
# Add this to your script
export PATH=$PATH:/home/mkc54/admixture/releases/admixture_linux-1.3.0/
# Then call admixture from anywhere
admixture -h
Check the Admixture options.
admixture --help
**** ADMIXTURE Version 1.3.0 ****
**** Copyright 2008-2015 ****
**** David Alexander, Suyash Shringarpure, ****
**** John Novembre, Ken Lange ****
**** ****
**** Please cite our paper! ****
**** Information at www.genetics.ucla.edu/software/admixture ****
ADMIXTURE basic usage: (see manual for complete reference)
% admixture [options] inputFile K
where:
K is the number of populations; and
inputFile may be:
- a PLINK .bed file
- a PLINK "12" coded .ped file
Output will be in files inputBasename.K.Q, inputBasename.K.P
General options:
-jX : do computation on X threads
--seed=X : use random seed X for initialization
Algorithm options:
-m=
--method=[em|block] : set method. block is default
-a=
--acceleration=none |
sqs<X> |
qn<X> : set acceleration
Convergence criteria:
-C=X : set major convergence criterion (for point estimation)
-c=x : set minor convergence criterion (for bootstrap and CV reestimates)
Bootstrap standard errors:
-B[X] : do bootstrapping [with X replicates]
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture
mkdir r_1
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
for i in $(seq 1 5)
do
mkdir run$i
done
Generate the script for Admixture. We will use 1000 bootstraps and cv=10.
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
for run in $(seq 1 5); do
for k in $(seq 1 40); do
echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/run$run; \
module load ADMIXTURE/1.3.0; \
admixture -s $run --cv=10 -B1000 -j20 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.bed $k | tee log_k$k.txt"
done
done > dsqr_1.txt # jobs
Generate the dsq batch files on the cluster.
module load dSQ/1.05
# make script
dsq \
--job-file dsqr_1.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europer1 \
--batch-file admixture_europer1.sh
sed -i '6 i #SBATCH --requeue' admixture_europer1.sh;
#check it
head admixture_europer1.sh
Before we submit all jobs, it is a good practice to submit only one job of the array and see if it runs without problems.
# create a directory for the logs
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
mkdir logs
#
# submit the first job to see if it works.
sbatch --array=0 admixture_europer1.sh
# Submitted batch job
# check status
dsqa -j 10387067 # if no errors, we can go ahead and submit all jobs
If it runs without problems, we can submit all jobs.
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
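# submit the full array (batch file generated above)
sbatch admixture_europer1.sh
# once the array finishes, collect any jobs that hit the time limit: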
dsqa -j 10763779 -f dsqr_1.txt -s TIMEOUT > rerun_jobs1.txt; wc -l rerun_jobs1.txt
# 23 jobs timed out; resubmit them
dsq \
--job-file rerun_jobs1.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europe_rr2 \
--batch-file admixture_europe_rr2.sh
sed -i '6 i #SBATCH --requeue' admixture_europe_rr2.sh;
sbatch admixture_europe_rr2.sh
# check status
dsqa -j 11805471
dsqa -j 11805471 -f rerun_jobs1.txt -s TIMEOUT > rerun_jobs2.txt; wc -l rerun_jobs2.txt
# 7 jobs timed out
dsq \
--job-file rerun_jobs2.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europe_rr2 \
--batch-file admixture_europe_rr2.sh
sed -i '6 i #SBATCH --requeue' admixture_europe_rr2.sh;
sbatch admixture_europe_rr2.sh
dsqa -j 12274594
dsqa -j 12274594 -f rerun_jobs2.txt -s TIMEOUT > rerun_jobs3.txt; wc -l rerun_jobs3.txt
dsq \
--job-file rerun_jobs3.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europe_rr3 \
--batch-file admixture_europe_rr3.sh
sed -i '6 i #SBATCH --requeue' admixture_europe_rr3.sh;
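# the submission step is implied here, as in the earlier rounds
sbatch admixture_europe_rr3.sh
# check status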
dsqa -j 12510727
Check that the seed is different for each run
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
# first get how many jobs we have
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | wc -l
# 200
# then count uniq seeds
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | uniq | wc -l
# 5
# if the counts do not match, we have duplicated seeds. That is fine as long as the repeated seeds are within the same run across different Ks; what we cannot have is the same seed in different runs for the same K. Here 5 unique seeds for 200 logs is expected, since each run uses one seed for all 40 Ks.
grep -H 'seed' run*/*.txt
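To test that constraint directly, a small sketch reusing the runN/log_kK.txt naming from above; empty output means no seed is shared between runs for the same K:
grep -H 'seed' run*/*.txt \
| sed 's|run| |; s|/log_k| |; s|.txt:Random seed:| |' \
| awk '{print $2, $3}' \
| sort | uniq -d # prints any K/seed pair that occurs in more than one run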
Collect the cross-validation information from all the log files, one run at a time; we need the CV error and the K from each log file. Check the log files first.
# navigate to the data directory
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
# let's check the log files using grep. We need to get the line with the CV values. We can grep CV
grep CV run1/log*.txt | head
## run1/log_k1.txt:CV error (K=1): 0.73092
## run1/log_k10.txt:CV error (K=10): 0.67077
## run1/log_k11.txt:CV error (K=11): 0.66934
## run1/log_k12.txt:CV error (K=12): 0.66819
## run1/log_k13.txt:CV error (K=13): 0.66866
## run1/log_k14.txt:CV error (K=14): 0.66722
## run1/log_k15.txt:CV error (K=15): 0.66841
## run1/log_k16.txt:CV error (K=16): 0.66932
## run1/log_k17.txt:CV error (K=17): 0.67033
## run1/log_k18.txt:CV error (K=18): 0.67279
Make a loop to get the CV summary from all 5 runs for r2<0.1
# on the cluster we can build the loop step by step. First check that the loop picks up the files we need: use echo $i to list them, piping through "| head" so it does not print all 200 files.
for i in $(ls -1 run*/log*.txt); do
echo $i
done | head -n 2
#run1/log_k10.txt
#run1/log_k11.txt
# now we can replace echo with our loop from the previous chunk, and save it as a file.
# remember, we now have to use grep and not echo
# first let's test the command. The macOS and Linux versions of these tools can behave differently. We add grep's -H flag to print the file path, which we need to recover the run number.
for i in $(ls -1 run*/log*.txt); do
grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done | head -n 2
#1 10 0.74723
#1 11 0.74521
# it works, now we can run the loop and write the results in a file.
for i in $(ls -1 run*/log*.txt); do
grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done > cross_validation_r1.txt
# we should have 200 lines in our file
wc -l cross_validation_r1.txt
# 200 cross_validation_r1.txt
head cross_validation_r1.txt
# it looks okay
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_1
cp cross_validation_r1.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/analyses/europe
Import the cross validation values into R for r_1
# the files are in the output directory
## ............................................................................
# working directory: /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns (project root for here())
cross_val <-
read_delim(
here(
"analyses", "europe", "cross_validation_r1.txt"
),
col_names = FALSE,
show_col_types = FALSE,
col_types = "iin"
) |>
rename(
run = 1,
k = 2,
cv = 3
) |>
mutate(
run = as.factor(
run
)
)
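A quick check that the parsing worked (run as factor, k integer, cv numeric):
glimpse(cross_val)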
Make a plot of the cross-validation errors for the 5 runs (1000 bootstraps, 10-fold cross-validation) for the Europe r_1 Admixture data
## ............................................................................
## 1000 bootstraps and 10-fold cv ####
#
# make plot
cross_val |>
ggplot() +
geom_line(
aes(
x = k,
y = cv,
group = run,
color = run,
linetype = run
),
linewidth = .75
) +
labs(
x = "K",
y = "Cross-validation error",
title = "Admixture Cross-validation for Europe (47K SNPs)",
    subtitle = "1000 bootstraps and cv = 10",
caption = "algorithm runs for choices of K ranging from 1 to 40"
) +
scale_color_manual(
values = c(
"#C1CDCD", "#1AC722", "#FF8C00", "#87CEFA", "#FF3030"
)
) +
theme(
panel.grid.major = element_line(
linetype = "dashed",
      linewidth = 0.2
),
legend.title = element_text(
size = 14,
face = "bold"
),
panel.grid.minor = element_line(
linetype = "dashed",
linewidth = 0.2
)
)
ggsave(
here(
"analyses", "europe", "admixture_r1_2000Bootstraps_CV_k1_k40.pdf"
),
width = 8,
height = 6,
units = "in"
)
Now we can find the K with the lowest cross-validation error for europe admixture r1
## 1000 bootstraps and 10-fold cv #
# to find the lowest cv error for each run
cross_val |>
group_by(
run
) |>
summarize(
LowestCVerror = min(
cv,
      na.rm = TRUE
)
) |>
arrange(
run
) -> cv_min
#
# we can also get the k with the lowest cv error
cross_val |>
group_by(
run
) |>
slice(
which.min(
cv
)
) |>
arrange(
run
) -> k_with_lowest_cv
#
# we can left_join the objects and make a table
admix_ks <-
left_join(
cv_min,
k_with_lowest_cv,
by = "run"
) |>
select(
-cv
)
#
# make table
ftbadmix <-
flextable(
admix_ks |>
rename(
Run = 1,
"Lowest CV error" = 2,
      K = 3
)
)|>
set_caption(
as_paragraph(
as_chunk(
"Ks with lowest cross-validation errors for 5 runs of Admixture (47k SNPs)",
props = fp_text_default(
        font.family = "Cambria"
)
)
),
  word_stylename = "Table Caption"
)
#
# check the table
autofit(ftbadmix)
Run | Lowest CV error | K |
---|---|---|
1 | 0.66722 | 14 |
2 | 0.66783 | 16 |
3 | 0.66818 | 14 |
4 | 0.66658 | 15 |
5 | 0.66733 | 15 |
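Echoing the earlier advice to consider mean, median, and mode across runs before settling on a K, a quick sketch using the admix_ks object created above (its k column holds each run's best K):
admix_ks |>
  summarize(
    mean_k = mean(k),
    median_k = median(k)
  )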
#
# save the table
# create settings - check ??save_as_docx on console
sect_properties <-
prop_section(
page_size = page_size(
orient = "portrait",
width = 8.3,
height = 11.7
),
type = "continuous",
page_margins = page_mar()
)
# save
# then you can open MS Word and make adjustments
ftbadmix |>
autofit() |>
save_as_docx(
path = here(
"analyses", "europe", "admixture_europe_CV_k1_k40_r1.docx"
),
pr_section = sect_properties
)
We will use 1000 bootstraps and cv=10, as for r_1. The core command (matching the dsq job lines below) is:
admixture -s $run --cv=10 -B1000 -j20 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.bed $k # 1000 bootstraps, 10-fold cross-validation, 20 threads
Now generate the full script for dsq.
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
for run in $(seq 1 5); do
for k in $(seq 1 40); do
echo "cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/run$run; \
module load ADMIXTURE/1.3.0; \
admixture -s $run --cv=10 -B1000 -j20 /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.bed $k | tee log_k$k.txt"
done
done > dsqr_01.txt # jobs
Generate the dsq batch files on the cluster.
module load dSQ/1.05
# make script
dsq \
--job-file dsqr_01.txt \
--output /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01/logs/admix-%A_%3a-%N.txt \
--mem-per-cpu 5g \
-t 24:00:00 \
--cpus-per-task=10 \
--mail-type ALL \
--partition=scavenge \
--job-name admixture_europer01 \
--batch-file admixture_europer01.sh
sed -i '6 i #SBATCH --requeue' admixture_europer01.sh;
#check it
head admixture_europer01.sh
Before we submit all jobs, it is a good practice to submit only one job of the array and see if it runs without problems.
# create a directory for the logs
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
mkdir logs
#
# submit the first job to see if it works.
sbatch --array=0 admixture_europer01.sh
# Submitted batch job
# check status
dsqa -j 10388171 # if no errors, we can go ahead and submit all jobs
If it runs without problems, we can submit all jobs.
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
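# submit the full array (batch file generated above), then check for timed-out jobs
sbatch admixture_europer01.sh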
dsqa -j 10389785 -f dsqr_01.txt -s TIMEOUT > rerun_jobs.txt; wc -l rerun_jobs.txt
# all completed!
For the re-runs, check for timeouts as well.
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
dsqa -j 10768022 -f dsqr_01.txt -s TIMEOUT > rerun_jobs.txt; wc -l rerun_jobs.txt
# all completed!
Check that the seed is different for each run
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
# first get how many jobs we have
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | wc -l
# 200
# then count uniq seeds
grep 'seed' run*/*.txt | awk '{print $3}'| sort -nk1 | uniq | wc -l
# 5
# if the counts do not match, we have duplicated seeds. That is fine as long as the repeated seeds are within the same run across different Ks; what we cannot have is the same seed in different runs for the same K. Here 5 unique seeds for 200 logs is expected, since each run uses one seed for all 40 Ks.
grep -H 'seed' run*/*.txt
## 200
## 5
## run1/log_k1.txt:Random seed: 1
## run1/log_k2.txt:Random seed: 1
## (... all 40 run1 logs report seed 1 ...)
## run2/log_k1.txt:Random seed: 2
## (... all 40 run2 logs report seed 2 ...)
## run3/log_k1.txt:Random seed: 3
## (... all 40 run3 logs report seed 3 ...)
## run4/log_k1.txt:Random seed: 4
## (... all 40 run4 logs report seed 4 ...)
## run5/log_k1.txt:Random seed: 5
## (... all 40 run5 logs report seed 5 ...)
Collect the cross-validation information from all the log files, one run at a time; we need the CV error and the K from each log file.
Check the log files first.
# navigate to the data directory
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
# let's check the log files using grep. We need to get the line with the CV values. We can grep CV
grep CV run1/log*.txt | head
#run1/log_k10.txt:CV error (K=10): 0.74723
#run1/log_k11.txt:CV error (K=11): 0.74521
#run1/log_k12.txt:CV error (K=12): 0.74563
Make a loop to get the CV summary from all 5 runs for r2<0.01
# on the cluster we can build the loop step by step. First check that the loop picks up the files we need: use echo $i to list them, piping through "| head" so it does not print all 200 files.
for i in $(ls -1 run*/log*.txt); do
echo $i
done | head -n 2
#run1/log_k10.txt
#run1/log_k11.txt
# now we can replace echo with our loop from the previous chunk, and save it as a file.
# remember, we now have to use grep and not echo
# first let's test the command. The macOS and Linux versions of these tools can behave differently. We add grep's -H flag to print the file path, which we need to recover the run number.
for i in $(ls -1 run*/log*.txt); do
grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done | head -n 2
#1 10 0.74723
#1 11 0.74521
# it works, now we can run the loop and write the results in a file.
for i in $(ls -1 run*/log*.txt); do
grep -H CV $i | sed 's|run| |' | sed 's|/log_k| |' | sed 's|.txt:CV error (K=| |' | sed 's|):||' | awk '{print $1, $3, $4}'
done > cross_validation_r01.txt
# we should have 200 lines in our file
wc -l cross_validation_r01.txt
# 200 cross_validation_r01.txt
head cross_validation_r01.txt
# it looks okay
cd /gpfs/gibbs/pi/caccone/mkc54/albo/europe/admixture/r_01
cp cross_validation_r01.txt /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/analyses/europe
Import the cross validation values into R
# the files are in the output directory
## ............................................................................
# working directory: /gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns (project root for here())
cross_val <-
read_delim(
here(
"analyses", "europe", "cross_validation_r01.txt"
),
col_names = FALSE,
show_col_types = FALSE,
col_types = "iin"
) |>
rename(
run = 1,
k = 2,
cv = 3
) |>
mutate(
run = as.factor(
run
)
)
Make a plot of the cross-validation errors for the 5 runs (1000 bootstraps, 10-fold cross-validation)
## ............................................................................
## 1000 bootstraps and 10-fold cv ####
#
# make plot
cross_val |>
ggplot() +
geom_line(
aes(
x = k,
y = cv,
group = run,
color = run,
linetype = run
),
linewidth = .75
) +
labs(
x = "K",
y = "Cross-validation error",
title = "Admixture Cross-validation for Europe (17K SNPs)",
    subtitle = "1000 bootstraps and cv = 10",
caption = "algorithm runs for choices of K ranging from 1 to 40"
) +
scale_color_manual(
values = c(
"#C1CDCD", "#1AC722", "#FF8C00", "#87CEFA", "#FF3030"
)
) +
theme(
panel.grid.major = element_line(
linetype = "dashed",
      linewidth = 0.2
),
legend.title = element_text(
size = 14,
face = "bold"
),
panel.grid.minor = element_line(
linetype = "dashed",
linewidth = 0.2
)
)
ggsave(
here(
"analyses", "europe", "admixture_r01_2000Bootstraps_CV_k1_k40.pdf"
),
width = 8,
height = 6,
units = "in"
)
Now we can find the K with the lowest cross-validation error
## 1000 bootstraps and 10-fold cv #
# to find the lowest cv error for each run
cross_val |>
group_by(
run
) |>
summarize(
LowestCVerror = min(
cv,
      na.rm = TRUE
)
) |>
arrange(
run
) -> cv_min
#
# we can also get the k with the lowest cv error
cross_val |>
group_by(
run
) |>
slice(
which.min(
cv
)
) |>
arrange(
run
) -> k_with_lowest_cv
#
# we can left_join the objects and make a table
admix_ks <-
left_join(
cv_min,
k_with_lowest_cv,
by = "run"
) |>
select(
-cv
)
#
# make table
ftbadmix <-
flextable(
admix_ks |>
rename(
Run = 1,
"Lowest CV error" = 2,
      K = 3
)
)|>
set_caption(
as_paragraph(
as_chunk(
"Ks with lowest cross-validation errors for 5 runs of Admixture (17k SNPs)",
props = fp_text_default(
        font.family = "Cambria"
)
)
),
  word_stylename = "Table Caption"
)
#
# check the table
autofit(ftbadmix)
Run | Lowest CV error | K |
---|---|---|
1 | 0.74408 | 16 |
2 | 0.74251 | 14 |
3 | 0.74269 | 14 |
4 | 0.74246 | 13 |
5 | 0.74310 | 12 |
#
# save the table
# create settings - check ??save_as_docx on console
sect_properties <-
prop_section(
page_size = page_size(
orient = "portrait",
width = 8.3,
height = 11.7
),
type = "continuous",
page_margins = page_mar()
)
# save
# then you can open MS Word and make adjustments
ftbadmix |>
autofit() |>
save_as_docx(
path = here(
"analyses", "europe", "admixture_europe_CV_k1_k40_r01.docx"
),
pr_section = sect_properties
)