Change to select matrix from best LEA run
leak20 <- read_delim(
here("euro_global/output/snps_sets/r2_0.1.snmf/K20/run1/r2_0.1_r1.20.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak20)
## # A tibble: 6 × 20
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.467 5.65e-3 2.91e-2 2.08e-2 2.06e-2 2.30e-2 7.48e-2 7.89e-3 5.27e-2 3.98e-2
## 2 0.311 2.01e-2 8.34e-2 1.82e-2 2.75e-2 2.10e-2 1.22e-1 1.00e-4 4.56e-2 1.27e-2
## 3 0.990 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5
## 4 0.845 9.99e-5 3.00e-4 9.99e-5 6.99e-3 9.99e-5 2.16e-2 6.00e-2 9.99e-5 1.42e-2
## 5 0.878 9.99e-5 9.99e-5 9.99e-5 1.74e-2 4.22e-3 1.23e-2 5.27e-2 2.50e-3 1.17e-2
## 6 0.998 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5
## # ℹ 10 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## # X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>
The fam file
fam_file <- here(
"euro_global/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Change ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1001 OKI 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02 2.29631e-02
## 2 1002 OKI 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02 2.10350e-02
## 3 1003 OKI 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1004 OKI 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03 9.99280e-05
## 5 1005 OKI 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02 4.21508e-03
## 6 1006 OKI 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## X7 X8 X9 X10 X11 X12
## 1 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03 1.01131e-02
## 2 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02 9.99730e-05
## 3 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03 1.23783e-02
## 5 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03 9.99262e-05
## 6 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## X13 X14 X15 X16 X17 X18
## 1 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03 5.56498e-02
## 2 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02 4.49698e-02
## 3 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05 9.99280e-05
## 5 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04 9.99262e-05
## 6 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## X19 X20
## 1 1.78205e-03 4.34181e-03
## 2 7.05643e-03 9.99730e-05
## 3 9.98380e-05 9.98380e-05
## 4 1.36630e-02 9.99280e-05
## 5 1.01074e-02 9.99262e-05
## 6 9.98289e-05 9.98289e-05
Rename the columns
# Rename the columns starting from the third one
leak20 <- leak20 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak20)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1001 OKI 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02 2.29631e-02
## 2 1002 OKI 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02 2.10350e-02
## 3 1003 OKI 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1004 OKI 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03 9.99280e-05
## 5 1005 OKI 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02 4.21508e-03
## 6 1006 OKI 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v7 v8 v9 v10 v11 v12
## 1 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03 1.01131e-02
## 2 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02 9.99730e-05
## 3 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03 1.23783e-02
## 5 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03 9.99262e-05
## 6 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v13 v14 v15 v16 v17 v18
## 1 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03 5.56498e-02
## 2 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02 4.49698e-02
## 3 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05 9.99280e-05
## 5 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04 9.99262e-05
## 6 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v19 v20
## 1 1.78205e-03 4.34181e-03
## 2 7.05643e-03 9.99730e-05
## 3 9.98380e-05 9.98380e-05
## 4 1.36630e-02 9.99280e-05
## 5 1.01074e-02 9.99262e-05
## 6 9.98289e-05 9.98289e-05
Import samples attributes
sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
selected<-subset (sampling_loc, !is.na(sampling_loc[,10]))
pops <- selected |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
pops <- pops |> filter(!is.na(order))
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region Subregion
## 1 BER 39.79081 -74.9291 Berlin, NJ USA North America
## 2 COL 39.97170 -82.9071 Columbus, OH USA North America
## 3 PAL 26.70560 -80.0364 Palm Beach USA North America
## 4 HOU 29.75491 -95.3505 Houston, TX USA North America
## 5 LOS 34.05220 -118.2437 Los Angeles USA North America
## 6 MAU -3.09161 -60.0325 Manaus, AM Brazil South America
## Year order
## 1 2018 1
## 2 2015 2
## 3 2018 3
## 4 2018 4
## 5 2018 5
## 6 2017 6
sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
pops <- pops |> filter(!is.na(order))
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region Subregion
## 1 BER 39.79081 -74.9291 Berlin, NJ USA North America
## 2 COL 39.97170 -82.9071 Columbus, OH USA North America
## 3 PAL 26.70560 -80.0364 Palm Beach USA North America
## 4 HOU 29.75491 -95.3505 Houston, TX USA North America
## 5 LOS 34.05220 -118.2437 Los Angeles USA North America
## 6 MAU -3.09161 -60.0325 Manaus, AM Brazil South America
## Year order
## 1 2018 1
## 2 2015 2
## 3 2018 3
## 4 2018 4
## 5 2018 5
## 6 2017 6
Merge with pops
# Add an index column to Q_tibble
leak20$index <- seq_len(nrow(leak20))
# Perform the merge as before
df1 <-
merge(
leak20,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5
## 411 OKI 1001 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02
## 412 OKI 1002 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02
## 413 OKI 1003 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 OKI 1004 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03
## 415 OKI 1005 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02
## 416 OKI 1006 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v6 v7 v8 v9 v10 v11
## 411 2.29631e-02 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03
## 412 2.10350e-02 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.99280e-05 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03
## 415 4.21508e-03 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v12 v13 v14 v15 v16 v17
## 411 1.01131e-02 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03
## 412 9.99730e-05 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05
## 414 1.23783e-02 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05
## 415 9.99262e-05 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v18 v19 v20 Latitude Longitude Pop_City Country
## 411 5.56498e-02 1.78205e-03 4.34181e-03 26.5013 127.9454 Okinawa Japan
## 412 4.49698e-02 7.05643e-03 9.99730e-05 26.5013 127.9454 Okinawa Japan
## 413 9.98380e-05 9.98380e-05 9.98380e-05 26.5013 127.9454 Okinawa Japan
## 414 9.99280e-05 1.36630e-02 9.99280e-05 26.5013 127.9454 Okinawa Japan
## 415 9.99262e-05 1.01074e-02 9.99262e-05 26.5013 127.9454 Okinawa Japan
## 416 9.98289e-05 9.98289e-05 9.98289e-05 26.5013 127.9454 Okinawa Japan
## Region Subregion Year order
## 411 East Asia 2018? 54
## 412 East Asia 2018? 54
## 413 East Asia 2018? 54
## 414 East Asia 2018? 54
## 415 East Asia 2018? 54
## 416 East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
Load libraries for pie-chart maps
## Registered S3 method overwritten by 'ggforce':
## method from
## scale_type.units units
##
## Attaching package: 'scatterpie'
## The following object is masked from 'package:sp':
##
## recenter
##
## Attaching package: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
Create color palette
color_palette_20 <-
c(
"#FF8C1A",
"#B20CC9",
"#77DD77",
"#1E90FF",
"#B22222",
"green",
"yellow2",
"#F49AC2",
"blue",
"#008080",
"purple4",
"#FFFF99",
"#75FAFF",
"#AE9393",
"magenta",
"green4",
"navy",
"#AE8333",
"purple",
"orangered"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_20) +
guides(fill = "none") + # Hide legend
coord_sf() +
#coord_sf(xlim = c(-90, 145), ylim = c(-30, 60)) +
my_theme()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
with SNP Set 2 LEA matrix
df1 <- df1 |>
filter(
Region == "Eastern Europe" | Region == "Southern Europe" | Region == "Western Europe"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_20) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
my_theme()
Filter out Asia from the matrix
# Add an index column to Q_tibble
leak20$index <- seq_len(nrow(leak20))
# Perform the merge as before
df1 <-
merge(
leak20,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5
## 411 OKI 1001 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02
## 412 OKI 1002 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02
## 413 OKI 1003 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 OKI 1004 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03
## 415 OKI 1005 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02
## 416 OKI 1006 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v6 v7 v8 v9 v10 v11
## 411 2.29631e-02 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03
## 412 2.10350e-02 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.99280e-05 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03
## 415 4.21508e-03 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v12 v13 v14 v15 v16 v17
## 411 1.01131e-02 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03
## 412 9.99730e-05 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05
## 414 1.23783e-02 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05
## 415 9.99262e-05 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v18 v19 v20 Latitude Longitude Pop_City Country
## 411 5.56498e-02 1.78205e-03 4.34181e-03 26.5013 127.9454 Okinawa Japan
## 412 4.49698e-02 7.05643e-03 9.99730e-05 26.5013 127.9454 Okinawa Japan
## 413 9.98380e-05 9.98380e-05 9.98380e-05 26.5013 127.9454 Okinawa Japan
## 414 9.99280e-05 1.36630e-02 9.99280e-05 26.5013 127.9454 Okinawa Japan
## 415 9.99262e-05 1.01074e-02 9.99262e-05 26.5013 127.9454 Okinawa Japan
## 416 9.98289e-05 9.98289e-05 9.98289e-05 26.5013 127.9454 Okinawa Japan
## Region Subregion Year order
## 411 East Asia 2018? 54
## 412 East Asia 2018? 54
## 413 East Asia 2018? 54
## 414 East Asia 2018? 54
## 415 East Asia 2018? 54
## 416 East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
df1 <- df1 |>
filter(
Region == "East Asia" | Region == "South Asia" | Region == "Southeast Asia"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_20) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 60)) +
my_theme()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Filter out America from the matrix
# Add an index column to Q_tibble
leak20$index <- seq_len(nrow(leak20))
# Perform the merge as before
df1 <-
merge(
leak20,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5
## 411 OKI 1001 0.466583 5.64551e-03 2.91126e-02 2.07730e-02 2.06457e-02
## 412 OKI 1002 0.310824 2.01057e-02 8.33858e-02 1.82188e-02 2.74903e-02
## 413 OKI 1003 0.989852 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 OKI 1004 0.844984 9.99280e-05 3.00413e-04 9.99280e-05 6.98909e-03
## 415 OKI 1005 0.878223 9.99262e-05 9.99262e-05 9.99262e-05 1.73929e-02
## 416 OKI 1006 0.998103 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v6 v7 v8 v9 v10 v11
## 411 2.29631e-02 7.47906e-02 7.89411e-03 5.27420e-02 3.98423e-02 8.68477e-03
## 412 2.10350e-02 1.22019e-01 9.99730e-05 4.56474e-02 1.26797e-02 2.23758e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.99280e-05 2.15622e-02 6.00394e-02 9.99280e-05 1.41907e-02 3.47921e-03
## 415 4.21508e-03 1.22851e-02 5.26514e-02 2.50057e-03 1.16512e-02 4.90798e-03
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v12 v13 v14 v15 v16 v17
## 411 1.01131e-02 9.37350e-02 4.03754e-02 1.32796e-02 2.91868e-02 1.85944e-03
## 412 9.99730e-05 1.60111e-01 5.60752e-02 2.10341e-02 8.74184e-03 1.79302e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 8.35050e-03 9.98380e-05 9.98380e-05
## 414 1.23783e-02 1.11324e-02 1.59733e-03 8.88493e-03 9.99280e-05 9.99280e-05
## 415 9.99262e-05 4.65299e-03 9.99262e-05 9.99262e-05 9.99262e-05 5.12692e-04
## 416 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05 9.98289e-05
## v18 v19 v20 Latitude Longitude Pop_City Country
## 411 5.56498e-02 1.78205e-03 4.34181e-03 26.5013 127.9454 Okinawa Japan
## 412 4.49698e-02 7.05643e-03 9.99730e-05 26.5013 127.9454 Okinawa Japan
## 413 9.98380e-05 9.98380e-05 9.98380e-05 26.5013 127.9454 Okinawa Japan
## 414 9.99280e-05 1.36630e-02 9.99280e-05 26.5013 127.9454 Okinawa Japan
## 415 9.99262e-05 1.01074e-02 9.99262e-05 26.5013 127.9454 Okinawa Japan
## 416 9.98289e-05 9.98289e-05 9.98289e-05 26.5013 127.9454 Okinawa Japan
## Region Subregion Year order
## 411 East Asia 2018? 54
## 412 East Asia 2018? 54
## 413 East Asia 2018? 54
## 414 East Asia 2018? 54
## 415 East Asia 2018? 54
## 416 East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
df1 <- df1 |>
filter(
Region == "North America" | Region == "South America"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19","v20"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_20) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-120, -15), ylim = c(-30, 60)) +
my_theme()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
replace with best Q matrix
leak5 <- read_delim(
here("euro_global/output/snps_sets/r2_0.1.snmf/K5/run1/r2_0.1_r1.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0100 0.432 0.201 0.137 0.220
## 2 0.0200 0.435 0.204 0.121 0.220
## 3 0.0157 0.464 0.175 0.147 0.198
## 4 0.000100 0.452 0.226 0.0883 0.233
## 5 0.000100 0.462 0.199 0.109 0.230
## 6 0.0234 0.457 0.192 0.116 0.211
The fam file
fam_file <- here(
"euro_global/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Change ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5
## 1 1001 OKI 0.01004940 0.432031 0.201044 0.1370630 0.219812
## 2 1002 OKI 0.01995590 0.434927 0.203757 0.1211740 0.220186
## 3 1003 OKI 0.01567250 0.464427 0.174829 0.1474360 0.197636
## 4 1004 OKI 0.00009999 0.452051 0.226127 0.0883153 0.233406
## 5 1005 OKI 0.00009999 0.462282 0.199097 0.1087080 0.229812
## 6 1006 OKI 0.02340520 0.457148 0.191711 0.1164300 0.211305
Rename the columns
# Rename the columns starting from the third one
leak5 <- leak5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak5)
## ind pop v1 v2 v3 v4 v5
## 1 1001 OKI 0.01004940 0.432031 0.201044 0.1370630 0.219812
## 2 1002 OKI 0.01995590 0.434927 0.203757 0.1211740 0.220186
## 3 1003 OKI 0.01567250 0.464427 0.174829 0.1474360 0.197636
## 4 1004 OKI 0.00009999 0.452051 0.226127 0.0883153 0.233406
## 5 1005 OKI 0.00009999 0.462282 0.199097 0.1087080 0.229812
## 6 1006 OKI 0.02340520 0.457148 0.191711 0.1164300 0.211305
Import samples attributes for euro_global
sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
pops <- pops |> filter(!is.na(order))
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region Subregion
## 1 BER 39.79081 -74.9291 Berlin, NJ USA North America
## 2 COL 39.97170 -82.9071 Columbus, OH USA North America
## 3 PAL 26.70560 -80.0364 Palm Beach USA North America
## 4 HOU 29.75491 -95.3505 Houston, TX USA North America
## 5 LOS 34.05220 -118.2437 Los Angeles USA North America
## 6 MAU -3.09161 -60.0325 Manaus, AM Brazil South America
## Year order
## 1 2018 1
## 2 2015 2
## 3 2018 3
## 4 2018 4
## 5 2018 5
## 6 2017 6
Merge with pops
# Add an index column to Q_tibble
leak5$index <- seq_len(nrow(leak5))
# Perform the merge as before
df1 <-
merge(
leak5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 411 OKI 1001 0.01004940 0.432031 0.201044 0.1370630 0.219812 26.5013 127.9454
## 412 OKI 1002 0.01995590 0.434927 0.203757 0.1211740 0.220186 26.5013 127.9454
## 413 OKI 1003 0.01567250 0.464427 0.174829 0.1474360 0.197636 26.5013 127.9454
## 414 OKI 1004 0.00009999 0.452051 0.226127 0.0883153 0.233406 26.5013 127.9454
## 415 OKI 1005 0.00009999 0.462282 0.199097 0.1087080 0.229812 26.5013 127.9454
## 416 OKI 1006 0.02340520 0.457148 0.191711 0.1164300 0.211305 26.5013 127.9454
## Pop_City Country Region Subregion Year order
## 411 Okinawa Japan East Asia 2018? 54
## 412 Okinawa Japan East Asia 2018? 54
## 413 Okinawa Japan East Asia 2018? 54
## 414 Okinawa Japan East Asia 2018? 54
## 415 Okinawa Japan East Asia 2018? 54
## 416 Okinawa Japan East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
Make a new palette with only 5 colors
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_5) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-20, 150), ylim = c(-30, 60)) +
my_theme()
df1 <- df1 |>
filter(
Region == "Eastern Europe" | Region == "Southern Europe" | Region == "Western Europe"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_5) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-15, 50), ylim = c(30, 55)) +
my_theme()
Filter out Asia from the matrix
# Perform the merge as before
df1 <-
merge(
leak5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 411 OKI 1001 0.01004940 0.432031 0.201044 0.1370630 0.219812 26.5013 127.9454
## 412 OKI 1002 0.01995590 0.434927 0.203757 0.1211740 0.220186 26.5013 127.9454
## 413 OKI 1003 0.01567250 0.464427 0.174829 0.1474360 0.197636 26.5013 127.9454
## 414 OKI 1004 0.00009999 0.452051 0.226127 0.0883153 0.233406 26.5013 127.9454
## 415 OKI 1005 0.00009999 0.462282 0.199097 0.1087080 0.229812 26.5013 127.9454
## 416 OKI 1006 0.02340520 0.457148 0.191711 0.1164300 0.211305 26.5013 127.9454
## Pop_City Country Region Subregion Year order
## 411 Okinawa Japan East Asia 2018? 54
## 412 Okinawa Japan East Asia 2018? 54
## 413 Okinawa Japan East Asia 2018? 54
## 414 Okinawa Japan East Asia 2018? 54
## 415 Okinawa Japan East Asia 2018? 54
## 416 Okinawa Japan East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_5) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 55)) +
my_theme()
# Perform the merge as before
df1 <-
merge(
leak5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 411 OKI 1001 0.01004940 0.432031 0.201044 0.1370630 0.219812 26.5013 127.9454
## 412 OKI 1002 0.01995590 0.434927 0.203757 0.1211740 0.220186 26.5013 127.9454
## 413 OKI 1003 0.01567250 0.464427 0.174829 0.1474360 0.197636 26.5013 127.9454
## 414 OKI 1004 0.00009999 0.452051 0.226127 0.0883153 0.233406 26.5013 127.9454
## 415 OKI 1005 0.00009999 0.462282 0.199097 0.1087080 0.229812 26.5013 127.9454
## 416 OKI 1006 0.02340520 0.457148 0.191711 0.1164300 0.211305 26.5013 127.9454
## Pop_City Country Region Subregion Year order
## 411 Okinawa Japan East Asia 2018? 54
## 412 Okinawa Japan East Asia 2018? 54
## 413 Okinawa Japan East Asia 2018? 54
## 414 Okinawa Japan East Asia 2018? 54
## 415 Okinawa Japan East Asia 2018? 54
## 416 Okinawa Japan East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
Filter out all but Americas from the matrix
Make pie plots
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_5) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-130, -15), ylim = c(-35, 50)) +
my_theme()
sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
selected<-subset (sampling_loc, !is.na(sampling_loc[,10]))
pops <- selected |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region Subregion
## 1 BER 39.79081 -74.9291 Berlin, NJ USA North America
## 2 COL 39.97170 -82.9071 Columbus, OH USA North America
## 3 PAL 26.70560 -80.0364 Palm Beach USA North America
## 4 HOU 29.75491 -95.3505 Houston, TX USA North America
## 5 LOS 34.05220 -118.2437 Los Angeles USA North America
## 6 MAU -3.09161 -60.0325 Manaus, AM Brazil South America
## Year order
## 1 2018 1
## 2 2015 2
## 3 2018 3
## 4 2018 4
## 5 2018 5
## 6 2017 6
Select a Q matrix from one of the runs for the best k
# Extract ancestry coefficients
k19run4 <- read_delim(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/fastStructure/r_1/run004/simple.19.meanQ" #copy one of Q matricies with best k to here and rename it
),
delim = " ",
# Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
k19run4 <- k19run4 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(k19run4)
## # A tibble: 6 × 19
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 0 0 0.264 0 0 0 0 0.0450 0 0
## 2 0 0 0 0 0 0.0669 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## # ℹ 7 more variables: X13 <dbl>, X14 <dbl>, X15 <dbl>, X16 <dbl>, X17 <dbl>,
## # X18 <dbl>, X19 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13
## 1 1001 OKI 0 0 0 0 0.2643 0.000000 0 0 0 0.044959 0 0 0.078525
## 2 1002 OKI 0 0 0 0 0.0000 0.066895 0 0 0 0.000000 0 0 0.000000
## 3 1003 OKI 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## 4 1004 OKI 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## 5 1005 OKI 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## 6 1006 OKI 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## X14 X15 X16 X17 X18 X19
## 1 0.612210 0.000000 0 0.000000 0.000000 0
## 2 0.449903 0.219687 0 0.172857 0.090652 0
## 3 0.999991 0.000000 0 0.000000 0.000000 0
## 4 0.999991 0.000000 0 0.000000 0.000000 0
## 5 0.999991 0.000000 0 0.000000 0.000000 0
## 6 0.999991 0.000000 0 0.000000 0.000000 0
Rename the columns
# Rename the columns starting from the third one
k19run4 <- k19run4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(k19run4)
## ind pop v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13
## 1 1001 OKI 0 0 0 0 0.2643 0.000000 0 0 0 0.044959 0 0 0.078525
## 2 1002 OKI 0 0 0 0 0.0000 0.066895 0 0 0 0.000000 0 0 0.000000
## 3 1003 OKI 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## 4 1004 OKI 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## 5 1005 OKI 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## 6 1006 OKI 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## v14 v15 v16 v17 v18 v19
## 1 0.612210 0.000000 0 0.000000 0.000000 0
## 2 0.449903 0.219687 0 0.172857 0.090652 0
## 3 0.999991 0.000000 0 0.000000 0.000000 0
## 4 0.999991 0.000000 0 0.000000 0.000000 0
## 5 0.999991 0.000000 0 0.000000 0.000000 0
## 6 0.999991 0.000000 0 0.000000 0.000000 0
Merge with pops
# Add an index column to Q_tibble
k19run4$index <- seq_len(nrow(k19run4))
# Perform the merge as before
df1 <-
merge(
k19run4,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7 v8 v9 v10 v11 v12 v13
## 411 OKI 1001 0 0 0 0 0.2643 0.000000 0 0 0 0.044959 0 0 0.078525
## 412 OKI 1002 0 0 0 0 0.0000 0.066895 0 0 0 0.000000 0 0 0.000000
## 413 OKI 1003 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## 414 OKI 1004 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## 415 OKI 1005 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## 416 OKI 1006 0 0 0 0 0.0000 0.000000 0 0 0 0.000000 0 0 0.000000
## v14 v15 v16 v17 v18 v19 Latitude Longitude Pop_City
## 411 0.612210 0.000000 0 0.000000 0.000000 0 26.5013 127.9454 Okinawa
## 412 0.449903 0.219687 0 0.172857 0.090652 0 26.5013 127.9454 Okinawa
## 413 0.999991 0.000000 0 0.000000 0.000000 0 26.5013 127.9454 Okinawa
## 414 0.999991 0.000000 0 0.000000 0.000000 0 26.5013 127.9454 Okinawa
## 415 0.999991 0.000000 0 0.000000 0.000000 0 26.5013 127.9454 Okinawa
## 416 0.999991 0.000000 0 0.000000 0.000000 0 26.5013 127.9454 Okinawa
## Country Region Subregion Year order
## 411 Japan East Asia 2018? 54
## 412 Japan East Asia 2018? 54
## 413 Japan East Asia 2018? 54
## 414 Japan East Asia 2018? 54
## 415 Japan East Asia 2018? 54
## 416 Japan East Asia 2018? 54
Need a color palette with at least 19 colors (for K=19) I’m using the same palette from the PCA
color_palette_19 <-
c(
"#AE9333",
"#FFB347",
"#77DD77",
"#779ECB",
"#1E90FF",
"#B22222",
"#FF8C1A",
"#B20CC9",
"#F49AC2",
"blue",
"#008080",
"purple4",
"#FFFF99",
"#75FAFF",
"#AE9393",
"magenta",
"green4",
"navy",
"green"
)
Make pie plot
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "RMarkdowns", "analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_19) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-15, 52), ylim = c(30, 60)) +
my_theme()
Change to select matrix from best LEA run
leak22 <- read_delim(
here("euro_global/output/snps_sets/r2_0.01_b.snmf/K22/run5/r2_0.01_b_r5.22.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak22)
## # A tibble: 6 × 22
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.13e-1 9.99e-5 9.99e-5 9.99e-5 0.494 5.05e-2 9.99e-5 1.79e-3 3.43e-2 5.14e-2
## 2 4.86e-2 1.00e-4 1.03e-2 2.78e-3 0.319 1.03e-3 3.56e-2 8.74e-3 6.26e-2 6.63e-2
## 3 9.98e-5 9.98e-5 9.98e-5 9.98e-5 0.980 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5
## 4 9.99e-5 9.99e-5 5.88e-3 3.40e-2 0.817 9.99e-5 1.80e-2 9.99e-5 5.80e-2 2.02e-2
## 5 9.99e-5 9.99e-5 9.99e-5 3.84e-3 0.844 5.71e-3 1.21e-2 9.99e-5 6.06e-2 6.65e-3
## 6 9.98e-5 9.98e-5 9.98e-5 9.98e-5 0.998 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5
## # ℹ 12 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## # X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>, X21 <dbl>, X22 <dbl>
The fam file
fam_file <- here(
"euro_global/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Change ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1001 OKI 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779 5.05374e-02
## 2 1002 OKI 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323 1.02586e-03
## 3 1003 OKI 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697 9.98380e-05
## 4 1004 OKI 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342 9.98920e-05
## 5 1005 OKI 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145 5.71364e-03
## 6 1006 OKI 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904 9.98108e-05
## X7 X8 X9 X10 X11 X12
## 1 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02 2.66775e-02
## 2 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02 3.31449e-02
## 3 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02 9.98920e-05
## 5 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## X13 X14 X15 X16 X17 X18
## 1 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02 9.00488e-03
## 2 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02 9.99550e-05
## 3 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05 4.38346e-03
## 4 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03 9.98920e-05
## 5 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## X19 X20 X21 X22
## 1 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05
## 2 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05
## 3 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05
## 4 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05
## 5 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
Rename the columns
# Rename the columns starting from the third one
leak22 <- leak22 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak22)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1001 OKI 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779 5.05374e-02
## 2 1002 OKI 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323 1.02586e-03
## 3 1003 OKI 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697 9.98380e-05
## 4 1004 OKI 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342 9.98920e-05
## 5 1005 OKI 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145 5.71364e-03
## 6 1006 OKI 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904 9.98108e-05
## v7 v8 v9 v10 v11 v12
## 1 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02 2.66775e-02
## 2 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02 3.31449e-02
## 3 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 4 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02 9.98920e-05
## 5 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## v13 v14 v15 v16 v17 v18
## 1 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02 9.00488e-03
## 2 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02 9.99550e-05
## 3 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05 4.38346e-03
## 4 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03 9.98920e-05
## 5 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## v19 v20 v21 v22
## 1 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05
## 2 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05
## 3 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05
## 4 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05
## 5 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05
## 6 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
Import samples attributes
sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
selected<-subset (sampling_loc, !is.na(sampling_loc[,10]))
pops <- selected |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
pops <- pops |> filter(!is.na(order))
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region Subregion
## 1 BER 39.79081 -74.9291 Berlin, NJ USA North America
## 2 COL 39.97170 -82.9071 Columbus, OH USA North America
## 3 PAL 26.70560 -80.0364 Palm Beach USA North America
## 4 HOU 29.75491 -95.3505 Houston, TX USA North America
## 5 LOS 34.05220 -118.2437 Los Angeles USA North America
## 6 MAU -3.09161 -60.0325 Manaus, AM Brazil South America
## Year order
## 1 2018 1
## 2 2015 2
## 3 2018 3
## 4 2018 4
## 5 2018 5
## 6 2017 6
Merge with pops
# Add an index column to Q_tibble
leak22$index <- seq_len(nrow(leak22))
# Perform the merge as before
df1 <-
merge(
leak22,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5
## 411 OKI 1001 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779
## 412 OKI 1002 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323
## 413 OKI 1003 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697
## 414 OKI 1004 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342
## 415 OKI 1005 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145
## 416 OKI 1006 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904
## v6 v7 v8 v9 v10 v11
## 411 5.05374e-02 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02
## 412 1.02586e-03 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.98920e-05 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02
## 415 5.71364e-03 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## v12 v13 v14 v15 v16 v17
## 411 2.66775e-02 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02
## 412 3.31449e-02 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05
## 414 9.98920e-05 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03
## 415 9.98830e-05 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## v18 v19 v20 v21 v22 Latitude
## 411 9.00488e-03 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05 26.5013
## 412 9.99550e-05 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05 26.5013
## 413 4.38346e-03 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05 26.5013
## 414 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 26.5013
## 415 9.98830e-05 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05 26.5013
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 26.5013
## Longitude Pop_City Country Region Subregion Year order
## 411 127.9454 Okinawa Japan East Asia 2018? 54
## 412 127.9454 Okinawa Japan East Asia 2018? 54
## 413 127.9454 Okinawa Japan East Asia 2018? 54
## 414 127.9454 Okinawa Japan East Asia 2018? 54
## 415 127.9454 Okinawa Japan East Asia 2018? 54
## 416 127.9454 Okinawa Japan East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
Load libraries for pie maps
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
Create color palette
color_palette_22 <-
c(
"#FF8C1A",
"#1E90FF",
"#008080",
"#B22222",
"orangered2",
"green",
"#FFFF99",
"navy",
"coral",
"#B20CC9",
"goldenrod3",
"#77DD77",
"#AE9393",
"#F49AC2",
"magenta",
"green4",
"#75FAFF",
"blue",
"yellow2",
"purple4",
"purple",
"chocolate4"
)
## Spherical geometry (s2) switched off
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20", "v21", "v22"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_22) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-90, 145), ylim = c(-30, 60)) +
my_theme()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
df1 <- df1 |>
filter(
Region == "Eastern Europe" | Region == "Southern Europe" | Region == "Western Europe"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20", "v21", "v22"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_22) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-13, 50), ylim = c(31, 53)) +
my_theme()
leak22$index <- seq_len(nrow(leak22))
# Perform the merge as before
df1 <-
merge(
leak22,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5
## 411 OKI 1001 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779
## 412 OKI 1002 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323
## 413 OKI 1003 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697
## 414 OKI 1004 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342
## 415 OKI 1005 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145
## 416 OKI 1006 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904
## v6 v7 v8 v9 v10 v11
## 411 5.05374e-02 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02
## 412 1.02586e-03 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.98920e-05 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02
## 415 5.71364e-03 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## v12 v13 v14 v15 v16 v17
## 411 2.66775e-02 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02
## 412 3.31449e-02 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05
## 414 9.98920e-05 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03
## 415 9.98830e-05 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## v18 v19 v20 v21 v22 Latitude
## 411 9.00488e-03 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05 26.5013
## 412 9.99550e-05 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05 26.5013
## 413 4.38346e-03 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05 26.5013
## 414 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 26.5013
## 415 9.98830e-05 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05 26.5013
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 26.5013
## Longitude Pop_City Country Region Subregion Year order
## 411 127.9454 Okinawa Japan East Asia 2018? 54
## 412 127.9454 Okinawa Japan East Asia 2018? 54
## 413 127.9454 Okinawa Japan East Asia 2018? 54
## 414 127.9454 Okinawa Japan East Asia 2018? 54
## 415 127.9454 Okinawa Japan East Asia 2018? 54
## 416 127.9454 Okinawa Japan East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19", "v20", "v21", "v22"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_22) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 55)) +
my_theme()
ggsave(
here("scripts", "RMarkdowns", "output", "euro_global", "lea", "MAF_1", "maps", "lea_MAF1_k22_euro_global_ASIA_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Filter out America from the matrix
leak22$index <- seq_len(nrow(leak22))
# Perform the merge as before
df1 <-
merge(
leak22,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5
## 411 OKI 1001 1.12653e-01 9.99280e-05 9.99280e-05 9.99280e-05 0.493779
## 412 OKI 1002 4.86145e-02 9.99550e-05 1.02890e-02 2.77542e-03 0.319323
## 413 OKI 1003 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 0.979697
## 414 OKI 1004 9.98920e-05 9.98920e-05 5.88100e-03 3.40031e-02 0.817342
## 415 OKI 1005 9.98830e-05 9.98830e-05 9.98830e-05 3.84252e-03 0.844145
## 416 OKI 1006 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 0.997904
## v6 v7 v8 v9 v10 v11
## 411 5.05374e-02 9.99280e-05 1.79134e-03 3.42946e-02 5.13805e-02 6.00100e-02
## 412 1.02586e-03 3.56242e-02 8.73696e-03 6.26307e-02 6.63284e-02 1.72237e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05
## 414 9.98920e-05 1.80011e-02 9.98920e-05 5.80196e-02 2.01962e-02 2.09973e-02
## 415 5.71364e-03 1.21276e-02 9.98830e-05 6.05534e-02 6.65074e-03 2.46074e-02
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## v12 v13 v14 v15 v16 v17
## 411 2.66775e-02 9.99280e-05 9.99280e-05 1.24226e-02 5.42137e-02 2.79981e-02
## 412 3.31449e-02 9.99550e-05 9.26243e-02 2.57066e-02 1.43312e-01 7.71757e-02
## 413 9.98380e-05 9.98380e-05 9.98380e-05 9.98380e-05 4.15423e-03 9.98380e-05
## 414 9.98920e-05 9.98920e-05 1.39880e-02 9.98920e-05 8.58207e-03 1.79089e-03
## 415 9.98830e-05 9.98830e-05 9.98830e-05 9.98830e-05 2.81391e-02 9.98830e-05
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05
## v18 v19 v20 v21 v22 Latitude
## 411 9.00488e-03 3.64674e-02 2.79699e-02 9.99280e-05 9.99280e-05 26.5013
## 412 9.99550e-05 3.60504e-02 1.89146e-02 9.99550e-05 9.99550e-05 26.5013
## 413 4.38346e-03 9.98380e-05 9.98380e-05 9.96870e-03 9.98380e-05 26.5013
## 414 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 26.5013
## 415 9.98830e-05 9.98830e-05 1.29216e-02 9.98830e-05 9.98830e-05 26.5013
## 416 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 9.98108e-05 26.5013
## Longitude Pop_City Country Region Subregion Year order
## 411 127.9454 Okinawa Japan East Asia 2018? 54
## 412 127.9454 Okinawa Japan East Asia 2018? 54
## 413 127.9454 Okinawa Japan East Asia 2018? 54
## 414 127.9454 Okinawa Japan East Asia 2018? 54
## 415 127.9454 Okinawa Japan East Asia 2018? 54
## 416 127.9454 Okinawa Japan East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
df1 <- df1 |>
filter(
Region == "North America" | Region == "South America"
)
world <- ne_countries(scale = "medium", returnclass = "sf")
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 2.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18","v19","v20", "v21", "v22"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_22) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-120, -22), ylim = c(-30, 55)) +
my_theme()
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 22642
## column count: 697
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 22642
## Character matrix gt cols: 697
## skip: 0
## nrows: 22642
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant: 22642
## All variants processed
Get population and individuals information
inds_full <- attr(d@gt,"dimnames")[[2]]
inds_full <- inds_full[-1]
a <- strsplit(inds_full, '_')
pops <- unname(sapply(a, FUN = function(x) return(as.character(x[1]))))
table(pops)
## pops
## ALD ALU ALV ARM BAR BEN BER BRE BUL CAM CES CHA CRO DES FRS GEL GES GRA GRC GRV
## 10 12 12 10 12 12 12 13 10 12 14 12 12 16 12 2 12 11 10 12
## HAI HAN HOC HUN IMP INJ INW ITB ITP ITR JAF KAC KAG KAN KAT KER KLP KRA KUN LAM
## 12 4 7 12 4 11 4 5 9 12 2 6 12 11 6 12 4 12 4 9
## MAL MAT OKI PAL POL POP QNC RAR REC ROM ROS SER SEV SIC SLO SOC SON SPB SPC SPM
## 12 12 12 11 2 12 11 12 11 4 11 4 12 9 12 12 3 8 6 5
## SPS SSK STS SUF SUU TAI TIK TIR TRE TUA TUH UTS YUN
## 8 12 12 6 6 7 12 4 12 9 12 12 9
## $repetitions
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7 K = 8 K = 9
## with cross-entropy 5 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5 5
## K = 10 K = 11 K = 12 K = 13 K = 14 K = 15 K = 16 K = 17
## with cross-entropy 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5
## K = 18 K = 19 K = 20 K = 21 K = 22 K = 23 K = 24 K = 25
## with cross-entropy 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5
##
## $crossEntropy
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7
## min 0.9295938 0.9035762 0.8929160 0.8866176 0.8831455 0.8810623 0.8793035
## mean 0.9299463 0.9038992 0.8932721 0.8869377 0.8840167 0.8813716 0.8797496
## max 0.9303090 0.9042813 0.8935996 0.8872815 0.8859704 0.8818163 0.8802178
## K = 8 K = 9 K = 10 K = 11 K = 12 K = 13 K = 14
## min 0.8777762 0.8754397 0.8737881 0.8724198 0.8709529 0.8703098 0.8702140
## mean 0.8779980 0.8760114 0.8743412 0.8732523 0.8716021 0.8712855 0.8703966
## max 0.8784509 0.8765061 0.8748110 0.8741085 0.8724610 0.8718638 0.8706670
## K = 15 K = 16 K = 17 K = 18 K = 19 K = 20 K = 21
## min 0.8691706 0.8691407 0.8684490 0.8678589 0.8677646 0.8675384 0.8673978
## mean 0.8697189 0.8695057 0.8689370 0.8686954 0.8680425 0.8681742 0.8678156
## max 0.8701692 0.8700150 0.8695248 0.8697050 0.8682069 0.8685108 0.8686613
## K = 22 K = 23 K = 24 K = 25
## min 0.8669903 0.8671792 0.8666401 0.8673425
## mean 0.8679212 0.8675363 0.8676546 0.8677506
## max 0.8688708 0.8679872 0.8685890 0.8681663
# get the cross-entropy of all runs for K = 7
ce = cross.entropy(project, K = 7)
ce #run 5 is best for k=7
## K = 7
## run 1 0.8793121
## run 2 0.8800735
## run 3 0.8802178
## run 4 0.8798409
## run 5 0.8793035
change to correct matrix
leak7 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.01_b.snmf/K7/run5/r2_0.01_b_r5.7.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak7)
## # A tibble: 6 × 7
## X1 X2 X3 X4 X5 X6 X7
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0756 0.0293 0.0751 0.0622 0.0574 0.0800 0.620
## 2 0.0512 0.0620 0.0632 0.0807 0.0627 0.125 0.556
## 3 0.000100 0.000100 0.0497 0.0150 0.0934 0.0266 0.815
## 4 0.0159 0.00257 0.0664 0.0291 0.0366 0.0927 0.757
## 5 0.000100 0.000100 0.0483 0.0153 0.0423 0.0944 0.800
## 6 0.000100 0.000100 0.0239 0.00847 0.0909 0.0414 0.835
The fam file
fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Change ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1001 OKI 0.075587600 0.029332800 0.0750965 0.06223380 0.0574401 0.0799922
## 2 1002 OKI 0.051196200 0.062006400 0.0631862 0.08066190 0.0626840 0.1246690
## 3 1003 OKI 0.000099982 0.000099982 0.0496697 0.01503730 0.0933872 0.0266181
## 4 1004 OKI 0.015876700 0.002571750 0.0663977 0.02908000 0.0366296 0.0927472
## 5 1005 OKI 0.000099982 0.000099982 0.0482861 0.01532970 0.0423157 0.0943542
## 6 1006 OKI 0.000099982 0.000099982 0.0239108 0.00847465 0.0909052 0.0413913
## X7
## 1 0.620317
## 2 0.555597
## 3 0.815088
## 4 0.756697
## 5 0.799514
## 6 0.835118
Rename the columns
# Rename the columns starting from the third one
leak7 <- leak7 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak7)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1001 OKI 0.075587600 0.029332800 0.0750965 0.06223380 0.0574401 0.0799922
## 2 1002 OKI 0.051196200 0.062006400 0.0631862 0.08066190 0.0626840 0.1246690
## 3 1003 OKI 0.000099982 0.000099982 0.0496697 0.01503730 0.0933872 0.0266181
## 4 1004 OKI 0.015876700 0.002571750 0.0663977 0.02908000 0.0366296 0.0927472
## 5 1005 OKI 0.000099982 0.000099982 0.0482861 0.01532970 0.0423157 0.0943542
## 6 1006 OKI 0.000099982 0.000099982 0.0239108 0.00847465 0.0909052 0.0413913
## v7
## 1 0.620317
## 2 0.555597
## 3 0.815088
## 4 0.756697
## 5 0.799514
## 6 0.835118
Import samples attributes for euro_global
sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
pops <- pops |> filter(!is.na(order))
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region Subregion
## 1 BER 39.79081 -74.9291 Berlin, NJ USA North America
## 2 COL 39.97170 -82.9071 Columbus, OH USA North America
## 3 PAL 26.70560 -80.0364 Palm Beach USA North America
## 4 HOU 29.75491 -95.3505 Houston, TX USA North America
## 5 LOS 34.05220 -118.2437 Los Angeles USA North America
## 6 MAU -3.09161 -60.0325 Manaus, AM Brazil South America
## Year order
## 1 2018 1
## 2 2015 2
## 3 2018 3
## 4 2018 4
## 5 2018 5
## 6 2017 6
Merge with pops
# Add an index column to Q_tibble
leak7$index <- seq_len(nrow(leak7))
# Perform the merge as before
df1 <-
merge(
leak7,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6
## 411 OKI 1001 0.075587600 0.029332800 0.0750965 0.06223380 0.0574401 0.0799922
## 412 OKI 1002 0.051196200 0.062006400 0.0631862 0.08066190 0.0626840 0.1246690
## 413 OKI 1003 0.000099982 0.000099982 0.0496697 0.01503730 0.0933872 0.0266181
## 414 OKI 1004 0.015876700 0.002571750 0.0663977 0.02908000 0.0366296 0.0927472
## 415 OKI 1005 0.000099982 0.000099982 0.0482861 0.01532970 0.0423157 0.0943542
## 416 OKI 1006 0.000099982 0.000099982 0.0239108 0.00847465 0.0909052 0.0413913
## v7 Latitude Longitude Pop_City Country Region Subregion Year
## 411 0.620317 26.5013 127.9454 Okinawa Japan East Asia 2018?
## 412 0.555597 26.5013 127.9454 Okinawa Japan East Asia 2018?
## 413 0.815088 26.5013 127.9454 Okinawa Japan East Asia 2018?
## 414 0.756697 26.5013 127.9454 Okinawa Japan East Asia 2018?
## 415 0.799514 26.5013 127.9454 Okinawa Japan East Asia 2018?
## 416 0.835118 26.5013 127.9454 Okinawa Japan East Asia 2018?
## order
## 411 54
## 412 54
## 413 54
## 414 54
## 415 54
## 416 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
make a new palette with only 7 colors
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "RMarkdowns", "analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_7) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_7) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-20, 150), ylim = c(-30, 60)) +
my_theme()
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_7) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 55)) +
my_theme()
library(LEA)
library(vcfR)
genotype <- here(
"euro_global/output/snps_sets/r2_0.01_b.vcf"
)
d <- read.vcfR(
genotype
)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 22642
## column count: 697
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 22642
## Character matrix gt cols: 697
## skip: 0
## nrows: 22642
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant: 22642
## All variants processed
Get population and individuals information
inds_full <- attr(d@gt,"dimnames")[[2]]
inds_full <- inds_full[-1]
a <- strsplit(inds_full, '_')
pops <- unname(sapply(a, FUN = function(x) return(as.character(x[1]))))
table(pops)
## pops
## ALD ALU ALV ARM BAR BEN BER BRE BUL CAM CES CHA CRO DES FRS GEL GES GRA GRC GRV
## 10 12 12 10 12 12 12 13 10 12 14 12 12 16 12 2 12 11 10 12
## HAI HAN HOC HUN IMP INJ INW ITB ITP ITR JAF KAC KAG KAN KAT KER KLP KRA KUN LAM
## 12 4 7 12 4 11 4 5 9 12 2 6 12 11 6 12 4 12 4 9
## MAL MAT OKI PAL POL POP QNC RAR REC ROM ROS SER SEV SIC SLO SOC SON SPB SPC SPM
## 12 12 12 11 2 12 11 12 11 4 11 4 12 9 12 12 3 8 6 5
## SPS SSK STS SUF SUU TAI TIK TIR TRE TUA TUH UTS YUN
## 8 12 12 6 6 7 12 4 12 9 12 12 9
## $repetitions
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7 K = 8 K = 9
## with cross-entropy 5 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5 5
## K = 10 K = 11 K = 12 K = 13 K = 14 K = 15 K = 16 K = 17
## with cross-entropy 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5
## K = 18 K = 19 K = 20 K = 21 K = 22 K = 23 K = 24 K = 25
## with cross-entropy 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5
##
## $crossEntropy
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7
## min 0.9295938 0.9035762 0.8929160 0.8866176 0.8831455 0.8810623 0.8793035
## mean 0.9299463 0.9038992 0.8932721 0.8869377 0.8840167 0.8813716 0.8797496
## max 0.9303090 0.9042813 0.8935996 0.8872815 0.8859704 0.8818163 0.8802178
## K = 8 K = 9 K = 10 K = 11 K = 12 K = 13 K = 14
## min 0.8777762 0.8754397 0.8737881 0.8724198 0.8709529 0.8703098 0.8702140
## mean 0.8779980 0.8760114 0.8743412 0.8732523 0.8716021 0.8712855 0.8703966
## max 0.8784509 0.8765061 0.8748110 0.8741085 0.8724610 0.8718638 0.8706670
## K = 15 K = 16 K = 17 K = 18 K = 19 K = 20 K = 21
## min 0.8691706 0.8691407 0.8684490 0.8678589 0.8677646 0.8675384 0.8673978
## mean 0.8697189 0.8695057 0.8689370 0.8686954 0.8680425 0.8681742 0.8678156
## max 0.8701692 0.8700150 0.8695248 0.8697050 0.8682069 0.8685108 0.8686613
## K = 22 K = 23 K = 24 K = 25
## min 0.8669903 0.8671792 0.8666401 0.8673425
## mean 0.8679212 0.8675363 0.8676546 0.8677506
## max 0.8688708 0.8679872 0.8685890 0.8681663
# get the cross-entropy of all runs for K = 6
ce = cross.entropy(project, K = 6)
ce #run 5 is best for k=6
## K = 6
## run 1 0.8811239
## run 2 0.8814691
## run 3 0.8818163
## run 4 0.8813863
## run 5 0.8810623
change to correct matrix
leak6 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/euro_global/output/snps_sets/r2_0.01_b.snmf/K6/run5/r2_0.01_b_r5.6.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak6)
## # A tibble: 6 × 6
## X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.174 0.172 0.150 0.350 0.131 0.0241
## 2 0.205 0.197 0.170 0.293 0.121 0.0131
## 3 0.160 0.156 0.123 0.400 0.117 0.0444
## 4 0.199 0.204 0.119 0.389 0.0885 0.000100
## 5 0.180 0.218 0.113 0.404 0.0847 0.000100
## 6 0.203 0.187 0.0976 0.404 0.108 0.000100
The fam file
fam_file <- here("euro_global/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 OKI 1001 0 0 2 -9
## 2 OKI 1002 0 0 2 -9
## 3 OKI 1003 0 0 2 -9
## 4 OKI 1004 0 0 2 -9
## 5 OKI 1005 0 0 2 -9
## 6 OKI 1006 0 0 1 -9
Change ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1001 OKI
## 2 1002 OKI
## 3 1003 OKI
## 4 1004 OKI
## 5 1005 OKI
## 6 1006 OKI
Add it to matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1001 OKI 0.173708 0.171848 0.1501010 0.349694 0.1305770 0.024071800
## 2 1002 OKI 0.205148 0.197109 0.1701940 0.292993 0.1214940 0.013061600
## 3 1003 OKI 0.159720 0.156319 0.1226130 0.399926 0.1169840 0.044438900
## 4 1004 OKI 0.198631 0.204137 0.1192340 0.389391 0.0885066 0.000099991
## 5 1005 OKI 0.179729 0.218195 0.1134380 0.403849 0.0846894 0.000099991
## 6 1006 OKI 0.202770 0.187071 0.0975617 0.404205 0.1082920 0.000099991
Rename the columns
# Rename the columns starting from the third one
leak6 <- leak6 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak6)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1001 OKI 0.173708 0.171848 0.1501010 0.349694 0.1305770 0.024071800
## 2 1002 OKI 0.205148 0.197109 0.1701940 0.292993 0.1214940 0.013061600
## 3 1003 OKI 0.159720 0.156319 0.1226130 0.399926 0.1169840 0.044438900
## 4 1004 OKI 0.198631 0.204137 0.1192340 0.389391 0.0885066 0.000099991
## 5 1005 OKI 0.179729 0.218195 0.1134380 0.403849 0.0846894 0.000099991
## 6 1006 OKI 0.202770 0.187071 0.0975617 0.404205 0.1082920 0.000099991
Import samples attributes for euro_global
sampling_loc <- readRDS(here("scripts", "RMarkdowns", "output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
pops <- pops |> filter(!is.na(order))
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region Subregion
## 1 BER 39.79081 -74.9291 Berlin, NJ USA North America
## 2 COL 39.97170 -82.9071 Columbus, OH USA North America
## 3 PAL 26.70560 -80.0364 Palm Beach USA North America
## 4 HOU 29.75491 -95.3505 Houston, TX USA North America
## 5 LOS 34.05220 -118.2437 Los Angeles USA North America
## 6 MAU -3.09161 -60.0325 Manaus, AM Brazil South America
## Year order
## 1 2018 1
## 2 2015 2
## 3 2018 3
## 4 2018 4
## 5 2018 5
## 6 2017 6
Merge with pops
# Add an index column to Q_tibble
leak6$index <- seq_len(nrow(leak6))
# Perform the merge as before
df1 <-
merge(
leak6,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6
## 411 OKI 1001 0.173708 0.171848 0.1501010 0.349694 0.1305770 0.024071800
## 412 OKI 1002 0.205148 0.197109 0.1701940 0.292993 0.1214940 0.013061600
## 413 OKI 1003 0.159720 0.156319 0.1226130 0.399926 0.1169840 0.044438900
## 414 OKI 1004 0.198631 0.204137 0.1192340 0.389391 0.0885066 0.000099991
## 415 OKI 1005 0.179729 0.218195 0.1134380 0.403849 0.0846894 0.000099991
## 416 OKI 1006 0.202770 0.187071 0.0975617 0.404205 0.1082920 0.000099991
## Latitude Longitude Pop_City Country Region Subregion Year order
## 411 26.5013 127.9454 Okinawa Japan East Asia 2018? 54
## 412 26.5013 127.9454 Okinawa Japan East Asia 2018? 54
## 413 26.5013 127.9454 Okinawa Japan East Asia 2018? 54
## 414 26.5013 127.9454 Okinawa Japan East Asia 2018? 54
## 415 26.5013 127.9454 Okinawa Japan East Asia 2018? 54
## 416 26.5013 127.9454 Okinawa Japan East Asia 2018? 54
df1$Country[df1$Country == 'USA'] <- 'United States of America' #to get USA outline to show up
df1$Country[df1$Country == 'Serbia'] <- 'Republic of Serbia'
make a new palette with only 6 colors
Make pie plot
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"scripts", "RMarkdowns", "analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_6) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_6) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-20, 150), ylim = c(-30, 60)) +
my_theme()
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_6) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(60, 150), ylim = c(-10, 46)) +
my_theme()
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_6) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-100, 150), ylim = c(-40, 60)) +
my_theme()
# Load necessary libraries
library(ggplot2)
library(scatterpie)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggrepel)
library(Cairo)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here("scripts", "RMarkdowns",
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = world, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 2.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.1, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = color_palette_6) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-100, -25), ylim = c(-40, 60)) +
my_theme()