Load LEA Libraries
library(LEA)
library(vcfR)
library(RColorBrewer)
library(ggplot2)
library(adegenet)
library(ape)
library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
library(here)
library(dplyr)
library(ggplot2)
library(colorout)
library(extrafont)
#library(scales)
library(stringr)
library(ggtext)
Check data - we created 2 vcf files with LD pruning r2<0.01 (LD1) and r2<0.1 (LD2) after QC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1_b.vcf
Import the data
genotype <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.vcf"
)
d <- read.vcfR(
genotype
)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 17028
## column count: 418
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 17028
## Character matrix gt cols: 418
## skip: 0
## nrows: 17028
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant: 17028
## All variants processed
Get population and individuals information
inds_full <- attr(d@gt,"dimnames")[[2]]
inds_full <- inds_full[-1]
a <- strsplit(inds_full, '_')
pops <- unname(sapply(a, FUN = function(x) return(as.character(x[1]))))
table(pops)
## pops
## ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC IMP ITB ITP ITR KER KRA
## 10 12 12 10 12 13 10 14 12 16 12 12 11 10 4 5 8 12 12 12
## MAL POL POP RAR ROM ROS SCH SER SEV SIC SLO SOC SPB SPC SPM SPS STS TIK TIR TRE
## 12 2 12 12 4 11 5 4 12 9 12 12 8 6 5 8 7 12 4 12
## TUA TUH
## 9 12
Convert format
##
## - number of detected individuals: 409
## - number of detected loci: 17028
##
## For SNP info, please check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.vcfsnp.
##
## 0 line(s) were removed because these are not SNPs.
## Please, check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.removed file, for more informations.
## [1] "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.geno"
##
## - number of detected individuals: 409
## - number of detected loci: 17028
##
## For SNP info, please check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.vcfsnp.
##
## 0 line(s) were removed because these are not SNPs.
## Please, check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.removed file, for more informations.
##
##
## - number of detected individuals: 409
## - number of detected loci: 17028
## [1] "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.lfmm"
PCA
## [1] "******************************"
## [1] " Principal Component Analysis "
## [1] "******************************"
## summary of the options:
##
## -n (number of individuals) 409
## -L (number of loci) 17028
## -K (number of principal components) 409
## -x (genotype file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.lfmm
## -a (eigenvalue file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.pca/r2_0.01.eigenvalues
## -e (eigenvector file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.pca/r2_0.01.eigenvectors
## -d (standard deviation file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.pca/r2_0.01.sdev
## -p (projection file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.pca/r2_0.01.projections
## -c data centered
## * pca class *
##
## project directory: /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/
## pca result directory: r2_0.01.pca/
## input file: r2_0.01.lfmm
## eigenvalue file: r2_0.01.eigenvalues
## eigenvector file: r2_0.01.eigenvectors
## standard deviation file: r2_0.01.sdev
## projection file: r2_0.01.projections
## pcaProject file: r2_0.01.pcaProject
## number of individuals: 409
## number of loci: 17028
## number of principal components: 409
## centered: TRUE
## scaled: FALSE
Test
## [1] "*******************"
## [1] " Tracy-Widom tests "
## [1] "*******************"
## summary of the options:
##
## -n (number of eigenvalues) 409
## -i (input file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.pca/r2_0.01.eigenvalues
## -o (output file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.pca/r2_0.01.tracywidom
# tw$pvalues
# plot the percentage of variance explained by each component
plot(tw$percentage, pch = 19, col = "blue", cex = .8)
Get values
# plot preparation
pc.coord <- as.data.frame(pc$projections)
colnames(pc.coord) <- paste0("PC", 1:nPC)
pc.coord$Individual <- inds
pc.coord$Population <- pops
# perc1 <- paste0(round(tw$percentage, digits = 3) * 100, "%")
perc <- paste0(round(pc$eigenvalues/sum(pc$eigenvalues), digits = 3) * 100, "%")
nb.cols <- 40
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
Check R symbols for plot
#to see all shapes -> plot shapes - para escolher os simbolos
N = 100; M = 1000
good.shapes = c(1:25,33:127)
foo = data.frame( x = rnorm(M), y = rnorm(M), s = factor( sample(1:N, M, replace = TRUE) ) )
ggplot(aes(x,y,shape=s ), data=foo ) +
scale_shape_manual(values=good.shapes[1:N]) +
geom_point()
Sample data
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
More sample data
# import sample attributes
samples2 <- read.delim(
here(
"output", "Population_meta_data.txt"
),
head = TRUE
)
samples2<- samples2 |>
dplyr::select(
region, pop
)
# check head of the file
head(samples2)
## region pop
## 1 Central Africa GAB
## 2 East Africa ANT
## 3 East Africa MAD
## 4 East Africa DGV
## 5 East Africa VOH
## 6 Indian Ocean DAU
Merge with sampling_loc
## pop region Pop_City Country Latitude Longitude
## 1 AIZ East Asia Aizuwakamatsu City Japan 37.49240 139.99360
## 2 ALD Southern Europe Durres Albania 41.29704 19.50373
## 3 ALU Eastern Europe Alushta Ukraine 44.68289 34.40368
## 4 ALV Southern Europe Vlore Albania 40.46600 19.48970
## 5 ANT East Africa Antananarivo Madagascar -18.87920 47.50790
## 6 ARM Eastern Europe Ijevan Armenia 40.87971 45.14764
## Continent Year Region Subregion order
## 1 Asia 2008 East Asia NA
## 2 Europe 2018 Southern Europe East Europe 25
## 3 Europe 2021 Eastern Europe East Europe 35
## 4 Europe 2020 Southern Europe East Europe 24
## 5 Africa 2022 East Africa East Africa 76
## 6 Europe 2020 Eastern Europe East Europe 42
Check pops
## [1] SOC SOC SOC SOC SOC SOC
## 42 Levels: ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC IMP ... TUH
## [1] 42
Merge
## Population PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 1 ALD -8.21430 -17.0067 28.2696 9.13237 -7.31486 1.890010 -0.303143
## 2 ALD -9.13508 -19.8738 32.8077 8.18154 -7.01175 5.301700 2.202660
## 3 ALD -8.04118 -24.3374 23.9759 11.16210 -7.67688 8.073790 -0.733297
## 4 ALD -7.92996 -22.0723 29.2841 4.24730 -8.75483 -0.403150 1.070740
## 5 ALD -8.49166 -17.2414 27.7908 3.52261 -8.80533 0.302187 -3.636510
## 6 ALD -7.50251 -20.3057 32.3615 8.74798 -6.94644 5.061250 1.217390
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## 1 4.09303 -5.80809 -3.709560 0.340379 -6.659530 7.8617200 -0.155011
## 2 2.85175 -8.14562 -6.049230 -3.323350 -13.233800 6.5543700 -1.123520
## 3 3.19734 -3.61882 -4.182580 -0.574487 -13.392300 2.5982300 2.764160
## 4 -1.52651 -6.18699 -2.091380 -2.647460 -0.434361 0.0953076 3.042900
## 5 -1.55109 -1.34862 0.198293 4.631020 -5.894140 7.2943000 0.378519
## 6 4.08356 -1.83300 -5.563490 1.911290 -6.986750 6.7547500 -0.200228
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## 1 1.1693200 -13.465100 -4.348240 2.796960 0.773428 -5.14010 -5.538200
## 2 -1.9538100 -12.043700 -1.569030 -0.674060 -1.114400 -3.77136 -4.653530
## 3 1.5221100 -9.973820 -0.185831 -0.672766 -1.305490 -7.79603 -4.500970
## 4 2.2270800 -0.587941 -1.661670 -3.119830 -1.202460 -5.73875 1.384530
## 5 -0.0613865 -8.643070 3.755050 -1.498680 -12.751700 7.49621 -12.576000
## 6 -2.5741100 -9.995150 -1.963430 3.296590 -1.819570 -8.54063 -0.858732
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## 1 -6.150350 4.46461 -4.20617 -1.158800 7.999000 -5.25016 -0.8701300
## 2 -0.534259 6.12881 -3.15805 -1.470200 6.923950 2.48864 -3.0674100
## 3 -2.324530 8.83578 -1.64695 -1.507920 5.698790 5.33068 -5.2078400
## 4 8.011850 -4.55014 -2.79673 -5.531640 -1.072080 9.12268 -4.2403800
## 5 -8.699800 4.18978 -1.93863 -0.569616 6.248740 -1.52298 -0.0891593
## 6 -4.380330 5.92297 -4.29010 8.664100 0.272357 -12.27350 0.7177290
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## 1 1.37910 -1.745880 2.954480 -8.22670 -13.475100 5.515340 19.53730
## 2 2.03222 -2.924660 1.550940 -3.31742 -8.701000 0.367712 10.80790
## 3 7.70501 0.445822 -1.186230 -3.41866 -1.014930 1.650480 11.67870
## 4 5.44991 -3.063320 2.376240 -5.29248 4.418640 -5.460070 6.56030
## 5 -6.46921 1.233780 0.397917 -4.24628 -3.748890 0.830447 6.39316
## 6 -14.61470 -0.653956 11.037400 -6.03417 0.627226 0.122318 -7.83948
## PC36 PC37 PC38 PC39 PC40 PC41 PC42 PC43
## 1 -7.476680 3.03550 7.161320 -1.26451 -5.843340 -5.82907 -1.957670 -11.59780
## 2 -7.992050 1.22789 3.760240 1.11028 -4.446560 -3.26460 -5.697130 -12.20050
## 3 -6.802090 -9.10229 0.363573 3.90878 -5.891140 -1.79057 -6.483830 -9.54966
## 4 -2.859490 6.55288 0.184628 -1.76356 0.415696 -9.25532 1.675850 4.23674
## 5 0.872882 -3.45116 5.300350 -2.22796 -2.091820 -2.59142 -0.669171 -9.19518
## 6 3.366990 4.31551 9.177580 2.61583 1.638170 15.02420 -4.686230 -1.55279
## PC44 PC45 PC46 PC47 PC48 PC49 PC50 PC51
## 1 4.93427 -3.527360 -5.0297900 -2.00367 -8.19202 -5.19308 3.36911 1.568220
## 2 5.96476 -7.812250 0.0167475 3.23424 -7.30426 -5.89624 -3.20332 -0.111356
## 3 4.72461 -9.417550 -1.6067200 2.87252 -10.45130 -5.25914 -5.21889 7.096440
## 4 -8.77900 0.624766 5.3657500 5.60253 -8.11357 -2.43234 7.93654 -1.838640
## 5 -3.10289 7.588550 -4.3675000 -4.38606 -4.50486 -6.83405 -2.46012 -1.830640
## 6 -5.60741 -1.368620 -4.4485800 -7.88933 21.93140 19.75190 -9.37870 -7.400100
## PC52 PC53 PC54 PC55 PC56 PC57 PC58 PC59
## 1 8.73487 1.98185 2.77680 -3.711990 1.87623 5.93608 -1.30318 2.20732
## 2 8.07883 2.49164 -1.87091 1.150950 3.30965 2.61319 10.41730 3.80067
## 3 2.14523 6.21600 -3.70002 -0.082636 6.56256 -2.37239 3.96816 -2.77049
## 4 5.71657 -2.39088 2.49533 4.529190 -3.73839 -3.04026 -3.29971 3.65263
## 5 4.60808 2.45392 0.65054 5.448840 -5.90652 5.82723 -2.80007 -1.25902
## 6 -9.70864 19.84640 20.16870 -8.978190 -2.54694 13.99430 -1.88523 -10.85290
## PC60 PC61 PC62 PC63 PC64 PC65 PC66 PC67
## 1 0.8403890 0.0860503 2.631670 -1.341010 8.64843 2.64101 -6.327900 -2.66791
## 2 0.5562170 -6.7963200 -0.217424 -0.540262 7.41227 5.32107 -0.248502 1.16441
## 3 1.0614300 -6.6099000 -4.560190 3.968260 3.07214 11.37720 -0.936651 9.63202
## 4 0.0975172 -8.9200400 1.642310 -9.591700 4.52055 1.34152 5.794380 3.97242
## 5 -0.6996120 -5.4308600 -6.551620 -8.522930 2.54995 -4.87942 -3.077160 1.80362
## 6 10.5835000 10.5678000 -1.402210 7.691210 7.07757 -3.47068 7.362540 -4.40558
## PC68 PC69 PC70 PC71 PC72 PC73 PC74 PC75
## 1 0.581937 -3.025220 2.47705 3.34854 -6.941990 -2.806830 0.941934 3.26959
## 2 4.808660 -0.595184 -2.83204 -3.47092 0.250346 5.507440 3.464790 2.63176
## 3 12.413600 10.863300 4.43932 -6.96800 -7.742750 2.010910 0.685804 5.69240
## 4 -3.752570 5.258450 2.35995 -2.02310 9.879500 -0.298271 6.042140 -3.33906
## 5 -0.594457 -5.465430 5.92170 -6.88881 6.262450 8.464230 -3.459710 2.40651
## 6 5.965840 1.897960 -8.92849 -3.56880 -0.927347 3.359840 8.025100 1.83391
## PC76 PC77 PC78 PC79 PC80 PC81 PC82 PC83
## 1 -5.01183 -0.687023 -4.05192 7.774820 1.16232 0.621487 2.215860 -8.64293
## 2 -9.65555 3.490720 -5.87214 0.166213 -3.06799 -3.601370 3.575120 -4.53117
## 3 -5.09731 3.711560 -4.95909 0.576973 -3.71681 4.511980 0.627223 -7.27335
## 4 -3.82822 -7.135400 -5.18120 -1.556700 5.23768 3.374450 13.024300 -1.92244
## 5 -10.02120 6.815360 1.24144 -6.541480 6.83672 -6.194910 5.115070 -7.61900
## 6 10.11950 1.116130 -2.22708 13.593900 0.45390 6.322950 -2.277880 -2.87058
## PC84 PC85 PC86 PC87 PC88 PC89 PC90
## 1 -2.811320 6.20563 -3.09796 8.460010 -3.358480 2.26225 4.56707000
## 2 0.457555 5.64885 2.48296 -1.858680 -4.423230 1.62632 0.73417300
## 3 3.960100 8.39713 7.23460 1.112500 -8.590660 1.24793 -1.98252000
## 4 -7.170460 1.90260 2.59909 -0.772832 11.351400 -11.33550 -0.00122312
## 5 -1.834780 -3.46358 4.18304 5.553840 3.176330 7.49356 1.78229000
## 6 -1.129220 2.01556 2.96210 2.336060 -0.729554 -4.00622 6.25459000
## PC91 PC92 PC93 PC94 PC95 PC96 PC97
## 1 3.858180 7.33960 7.07050 -0.0306813 -9.329420 -0.599571 3.105120
## 2 -3.931240 9.44069 -4.33882 2.2528300 0.848863 1.730050 -0.186474
## 3 -10.413400 8.73796 -2.15057 -0.8582610 1.796410 -1.571090 -1.264820
## 4 5.116830 7.21459 5.85536 5.6571600 -5.070530 -5.554030 -5.692770
## 5 7.329640 -7.14342 -1.39165 8.6504000 5.396360 4.523580 5.284220
## 6 -0.062367 6.79089 -4.55338 0.2920530 3.590300 -0.983969 -6.064830
## PC98 PC99 PC100 PC101 PC102 PC103 PC104 PC105
## 1 -1.823400 6.06250 4.04595 -1.03710 -3.62009 -1.46811 -4.35710 4.765850
## 2 6.296790 4.63017 2.53228 3.73988 -4.43679 -4.13884 -3.00113 2.516860
## 3 12.454800 8.27796 3.95510 -2.68587 1.87550 2.43799 -2.73167 -5.499660
## 4 0.241615 -2.39100 -3.97580 3.14852 6.68784 12.46180 -2.31687 0.383566
## 5 1.691330 -5.03442 4.53664 -9.47128 1.11702 -2.43120 13.03890 0.384216
## 6 -8.578470 6.74765 5.56778 4.82964 -3.34173 -2.81847 1.01273 4.551640
## PC106 PC107 PC108 PC109 PC110 PC111 PC112
## 1 -2.85218 -1.02711 1.318390 -8.378890 -6.39503 2.2062300 1.0138400
## 2 -3.44372 7.12482 0.994501 -1.482180 -5.45210 3.1500300 3.5735400
## 3 3.18661 3.32629 12.169000 -4.192980 -7.09760 -1.9997400 1.9796400
## 4 -2.28832 7.72335 3.667690 4.621870 3.65979 -3.9101500 -0.0636025
## 5 -4.04526 -8.02478 0.737849 -0.228484 -5.91016 1.6303800 -1.8987000
## 6 1.81022 -8.13505 -4.307860 3.270970 1.40957 -0.0452829 -0.0718173
## PC113 PC114 PC115 PC116 PC117 PC118 PC119
## 1 5.249500 -2.5191300 11.602200 -2.46354 5.416960 6.571470 -9.6012700
## 2 0.266868 -6.1386700 5.599020 1.10915 -0.953544 0.889206 -0.1090520
## 3 1.055680 -0.0150301 -4.570900 6.39166 -5.966830 1.916920 6.6026900
## 4 -4.508010 -0.8979010 9.050240 -7.12015 2.065660 1.874780 -0.0685059
## 5 -0.317001 -0.1788230 2.601050 1.35997 4.564100 6.911970 3.4354300
## 6 -4.062980 2.0930800 -0.124238 4.59316 3.020750 1.002430 -1.2878800
## PC120 PC121 PC122 PC123 PC124 PC125 PC126
## 1 3.510610 6.86304 -0.0462903 0.980389 0.7166110 -4.18127 -6.65868
## 2 3.169750 -4.12698 -2.7021000 2.213470 -1.5826200 -3.35444 -2.67264
## 3 -2.424830 -3.53187 4.6940100 6.276760 -4.3071600 6.43292 -1.46590
## 4 -6.474050 9.40797 -7.7650700 11.177500 3.6744300 1.19932 -5.92493
## 5 -6.418300 -13.39770 8.9784500 -4.584540 1.5266500 5.45493 -6.15213
## 6 -0.697029 7.31466 3.5574000 -3.939760 0.0374255 1.48579 1.30700
## PC127 PC128 PC129 PC130 PC131 PC132 PC133
## 1 10.08810 5.1916000 1.87913 -3.325180 2.78037 3.661360 -0.423874
## 2 11.06520 -6.2155700 -6.58534 -5.476800 -1.76891 0.124370 3.046070
## 3 -5.72853 -10.2119000 -2.45709 -0.340293 -1.63394 -6.280450 0.286809
## 4 7.12268 -2.6467500 10.51310 -3.171430 -7.41093 1.453560 5.798760
## 5 3.38038 0.0563015 18.93330 2.484490 -7.00075 0.810117 1.269480
## 6 -1.46382 -0.0643609 1.22396 -6.287890 -2.79799 3.663360 -3.526200
## PC134 PC135 PC136 PC137 PC138 PC139 PC140 PC141
## 1 -0.201753 -0.372392 1.306280 -0.768750 1.02503 4.48781 4.782960 -4.85312
## 2 4.257210 -3.043080 1.272860 1.881280 -5.46002 -4.30455 0.721187 -6.44852
## 3 2.606200 0.526002 -0.832852 6.042340 4.09749 -11.05980 -1.841290 -7.82649
## 4 0.357232 1.432710 -1.511700 -2.986800 -6.01318 2.72816 0.306502 -2.94844
## 5 7.995250 -6.967830 3.680850 0.378416 3.66684 2.14495 -5.261010 14.18300
## 6 -1.845370 -4.437480 -4.277660 4.705920 6.30570 5.06645 -3.370450 3.71680
## PC142 PC143 PC144 PC145 PC146 PC147 PC148 PC149
## 1 5.703780 3.39721 -0.753911 12.28610 -7.03611 0.44368 -5.398630 0.678251
## 2 -2.708200 5.36907 5.369900 -8.90452 6.89421 -5.71792 -0.114977 6.437240
## 3 -4.084290 -1.96615 -3.052210 -8.03402 1.45953 4.22000 1.502390 -6.983320
## 4 -0.329862 -3.37610 1.247170 6.35532 10.92730 -8.18989 7.555830 5.983900
## 5 9.430530 2.79926 -3.646100 2.50340 -3.20020 -15.84230 -3.943920 1.651970
## 6 -0.518379 3.08677 -2.241830 -3.70905 3.17274 -4.80937 -0.919783 3.569660
## PC150 PC151 PC152 PC153 PC154 PC155 PC156
## 1 -2.952830 6.35495 -0.8449750 -0.720149 2.946130 -4.98160 4.0807300
## 2 -4.768600 -14.89170 2.9857700 2.536260 -7.298720 -3.77620 2.5925400
## 3 3.575240 5.21871 -0.5739830 2.014930 -4.908860 -2.09432 -8.9545200
## 4 0.821516 1.80700 4.1542900 -7.279380 3.247500 4.43529 16.7377000
## 5 6.649030 -8.03133 6.2207400 -2.052390 5.791290 8.69434 -0.0602394
## 6 -6.441170 -2.39214 -0.0370194 2.459010 -0.138666 -3.04297 1.1097900
## PC157 PC158 PC159 PC160 PC161 PC162 PC163
## 1 3.976840 -3.469780 7.5988200 -0.0331047 -5.431150 -7.04277 -4.134690
## 2 0.618990 -11.070500 0.0934573 -8.2632900 -3.918030 2.53659 -0.133549
## 3 -2.336050 -0.533611 -0.5543650 0.6812940 1.979500 -3.10626 -0.428804
## 4 -5.167880 1.152950 -3.5054900 -4.1929200 4.178330 -3.38338 3.323890
## 5 1.419070 5.878510 5.1778700 -7.6417000 3.808200 -5.32098 3.240970
## 6 0.676656 3.581700 -5.7703700 6.1826100 0.854272 -4.79013 -2.636690
## PC164 PC165 PC166 PC167 PC168 PC169 PC170 PC171
## 1 -5.018030 5.531940 3.69468 -1.008200 6.43822 1.07515 -3.98076 2.69420
## 2 0.805952 3.990410 5.75353 -6.329580 -0.91289 6.65209 -2.80097 6.21442
## 3 3.720560 -0.614139 -4.57770 5.312290 1.33526 -2.29389 -3.15247 -8.52614
## 4 0.607954 15.257300 -7.52236 -2.321100 1.38237 2.29240 6.70897 3.63406
## 5 3.533090 -6.556990 7.66968 3.595310 1.46707 -1.44338 5.41633 9.53268
## 6 0.566821 2.998420 2.49277 -0.824117 1.30426 -2.40788 -1.02689 1.27085
## PC172 PC173 PC174 PC175 PC176 PC177 PC178
## 1 -3.792350 6.95176 2.880300 -11.2308000 11.365500 3.142640 -3.491010
## 2 0.266749 4.72441 -0.675487 -1.4060000 3.688430 0.538678 1.859250
## 3 -3.293660 2.24290 -7.683630 -1.2216200 -5.808300 7.189410 2.410650
## 4 7.494520 -8.73579 -5.343900 0.7387800 5.478450 -8.380210 2.936110
## 5 8.901030 -9.69351 -7.011250 9.0163800 3.287310 2.310060 -4.638180
## 6 -1.033850 -7.36770 -0.585029 0.0294332 -0.515474 0.509816 -0.911378
## PC179 PC180 PC181 PC182 PC183 PC184 PC185
## 1 -3.629400 -1.555800 -13.995200 -6.970300 -0.508464 -2.742150 -1.605690
## 2 5.700350 4.708630 -7.694810 0.872169 -3.434440 -5.499800 5.149560
## 3 -1.992510 -5.689260 2.326000 -0.722749 2.789570 7.381750 0.711569
## 4 4.226270 5.722140 -0.373314 -1.701770 -2.294950 -4.549260 -1.146150
## 5 -0.321296 -5.108110 4.440530 -7.249970 -3.050800 3.265260 4.108140
## 6 0.274945 -0.894122 -0.189075 3.831660 0.527425 0.329162 -1.723500
## PC186 PC187 PC188 PC189 PC190 PC191 PC192
## 1 3.124090 -0.754903 2.354970 -3.74628 -1.153440 -15.932200 2.158450
## 2 1.686210 2.419230 -5.819950 11.02600 0.708929 1.236140 -5.122850
## 3 0.371391 -2.785690 0.126351 -2.38104 -3.313110 4.057850 7.774660
## 4 -1.902570 -3.026470 3.627740 0.14961 3.757740 6.139590 -1.920020
## 5 3.309310 5.671110 3.478970 -1.58281 -5.239490 0.839059 -0.152406
## 6 -0.733634 1.654800 1.960650 -3.60872 5.046070 2.471950 2.468140
## PC193 PC194 PC195 PC196 PC197 PC198 PC199
## 1 -3.611070 6.839640 5.981660 1.970660 -8.474360 -0.715287 4.378700
## 2 -3.254640 -5.276700 8.607090 4.227120 -9.824600 4.064270 0.605308
## 3 3.392830 3.363790 0.877238 0.626084 -2.241790 -2.500130 -0.343753
## 4 1.224050 10.282900 7.364710 6.318430 -0.652640 -0.880231 0.875768
## 5 5.591810 -1.667580 -0.921627 -10.390800 3.273590 -5.220590 1.764130
## 6 -0.552844 0.814239 -1.718660 2.407850 -0.170615 4.888830 0.165336
## PC200 PC201 PC202 PC203 PC204 PC205 PC206 PC207
## 1 4.490840 -9.70488 -8.126550 -0.343871 8.22782 -2.394890 -4.26557 -1.0885500
## 2 7.531900 7.35061 -1.150820 13.625000 -7.62512 -4.449760 -1.71075 -0.0745025
## 3 -0.286753 -3.42020 5.992660 -0.337667 -3.27011 2.140320 4.43083 -1.7667600
## 4 -6.376110 1.23706 2.702490 -0.697875 2.44022 -1.326680 7.33054 7.6270400
## 5 -7.499960 -7.46513 5.867150 -9.377160 7.42368 0.606197 -1.81599 -2.3849700
## 6 -3.323890 1.33111 0.658782 0.828527 -4.25638 -1.000660 1.62519 0.3919310
## PC208 PC209 PC210 PC211 PC212 PC213 PC214
## 1 -2.641720 7.30997 -4.979960 10.545800 5.637380 5.532450 3.067230
## 2 3.105580 -2.81218 0.508603 8.632000 0.565548 -7.654850 2.747430
## 3 7.538690 6.47991 -2.744520 0.512156 4.619480 -5.385640 3.207490
## 4 -0.815223 -1.44270 -3.344640 -3.732040 -2.218590 0.162834 2.063710
## 5 9.200890 4.33795 -8.520220 -1.067630 -9.652200 3.658440 -2.361580
## 6 -0.102882 2.94442 -3.914230 -4.677930 -0.271325 -4.172190 0.177016
## PC215 PC216 PC217 PC218 PC219 PC220 PC221
## 1 5.751250 1.199640 -0.903033 8.854710 -1.588090 6.87624 -1.826530
## 2 -2.302080 0.688732 0.525100 -6.875930 0.995497 -3.41362 0.768956
## 3 10.383700 -1.711520 -0.739404 -5.250770 -1.532120 1.97367 -3.191700
## 4 3.071020 1.855850 4.945790 3.549340 5.215430 4.24756 1.450420
## 5 -3.303600 0.457507 0.866485 5.940950 6.208060 -1.51252 4.898420
## 6 0.578008 3.267480 4.752740 -0.572794 -1.797170 -1.56590 1.225430
## PC222 PC223 PC224 PC225 PC226 PC227 PC228
## 1 -2.609400 -3.56131000 0.940693 -3.3055100 -5.028070 -1.675550 -5.08072
## 2 -4.766570 -0.53195200 -5.342730 -5.8206600 8.097490 3.583680 -5.14459
## 3 2.581620 6.40823000 -4.055020 -0.0137062 2.203160 -4.023300 -5.05475
## 4 4.448060 -8.38636000 -0.173054 -4.3530000 -7.734760 9.215610 6.43647
## 5 -0.305186 -0.00939418 1.383930 -0.0301437 1.016080 3.728950 -4.22062
## 6 4.881970 -1.24949000 -2.178420 3.7155800 -0.665552 0.401292 0.67716
## PC229 PC230 PC231 PC232 PC233 PC234 PC235
## 1 -6.8362800 5.826690 6.640640 -6.90616 -5.946300 -4.88631 4.638770
## 2 -1.6826900 8.602850 -9.273150 4.69466 2.414680 -1.74661 3.803330
## 3 0.0113461 -1.548680 0.873715 -3.51404 -0.535504 -3.96993 6.418000
## 4 -3.9290200 -1.480900 -3.604880 2.68369 0.852202 -4.47172 -0.278838
## 5 0.8050560 3.484460 2.894430 -3.02240 -1.415020 4.57320 -2.071600
## 6 -2.7260800 0.411528 -0.790293 1.52781 1.020850 1.89139 -1.364230
## PC236 PC237 PC238 PC239 PC240 PC241 PC242
## 1 1.841980 3.250690 6.9098100 -2.082880 4.77006 -2.669150 -2.88742
## 2 4.297350 -1.271750 -3.3034500 6.999620 -2.15901 -0.629642 -3.92519
## 3 0.646541 3.926200 -1.5822900 2.227810 4.51158 1.495460 -1.12591
## 4 4.566880 -1.829030 2.7081200 1.497500 1.99119 0.445603 5.14844
## 5 -4.533370 1.684060 -2.0557900 2.923500 -9.02161 -6.025340 -4.75014
## 6 -2.393100 -0.958001 0.0611606 0.219682 -1.46948 1.706730 -1.94469
## PC243 PC244 PC245 PC246 PC247 PC248 PC249
## 1 6.557060 -1.498940 4.16838 5.83641 -0.382151 -7.68925 1.6121900
## 2 -0.502386 1.759140 -10.05010 -2.40454 -5.781220 -2.69857 -0.0317713
## 3 -2.323920 1.701680 3.40160 -3.75933 5.174740 1.34890 3.6327800
## 4 3.388250 2.652910 -4.28316 -2.66549 4.951720 5.23120 -2.7143000
## 5 3.033380 -0.218249 5.01119 -1.66312 -2.287920 3.68705 1.8756100
## 6 1.170960 -1.752770 -2.49073 -4.33073 -0.312279 0.75174 0.2817000
## PC250 PC251 PC252 PC253 PC254 PC255 PC256
## 1 0.764144 -5.872290 -7.363430 -3.082300 0.391021 -5.305750 -4.369200
## 2 -4.165070 -2.076960 -0.973685 5.171000 -3.374870 2.106330 0.752396
## 3 -9.479530 0.810119 4.522640 -6.119270 1.634050 -0.664667 -5.561520
## 4 -5.724000 -5.833530 2.523340 -3.203230 -9.161950 -6.226200 3.568560
## 5 1.792480 4.703360 -7.946680 0.751526 1.377980 -2.688510 -4.034530
## 6 -2.985950 1.831450 -1.242710 -3.477020 0.343388 1.113880 0.681361
## PC257 PC258 PC259 PC260 PC261 PC262 PC263 PC264
## 1 10.217000 2.798690 -0.20404 6.00096 -4.395450 -1.74089 14.85330 -4.464930
## 2 -5.732260 8.785600 2.05873 -8.80900 -4.774060 5.05502 -6.41775 2.170160
## 3 3.337810 0.125902 2.01051 -0.93745 -1.243950 -4.07192 4.38801 -3.298350
## 4 0.388167 -8.616340 4.53585 7.27976 5.417960 -1.73350 -10.24480 0.530276
## 5 -9.155030 2.792100 7.37488 1.69575 4.771990 1.53410 2.85344 1.453960
## 6 0.231469 2.879500 3.14342 -2.24327 -0.187722 1.39280 -1.54585 1.954860
## PC265 PC266 PC267 PC268 PC269 PC270 PC271
## 1 -4.601270 -3.632880 -3.420580 5.601980 1.73918 -5.757820 2.518890
## 2 6.914540 0.854455 0.214313 -10.811600 -6.30173 4.775130 3.610490
## 3 0.868803 4.652550 3.155700 5.374590 9.81615 -5.929760 3.294340
## 4 -4.331380 4.627150 2.329690 3.149330 3.14202 -0.235314 -1.078220
## 5 -3.228580 5.899340 1.818200 -5.714860 -3.46241 4.436150 0.795724
## 6 1.176700 -1.463310 0.316981 -0.110699 1.28764 -0.200808 -0.280158
## PC272 PC273 PC274 PC275 PC276 PC277 PC278
## 1 3.58193 -1.168390 -7.07120 -5.224730 4.137620 -3.408510 0.6219860
## 2 1.40319 4.402740 -1.38285 5.955280 -1.589800 6.596900 1.9750600
## 3 1.20705 0.038381 3.11563 -0.960092 -4.378120 -3.693600 -2.0753300
## 4 3.74612 -3.355080 7.15358 2.592350 1.633200 -2.412180 -1.5571100
## 5 -2.29567 -0.104475 -7.45907 9.908510 -1.999400 -0.583372 4.1093100
## 6 2.20110 1.867570 1.26245 0.773451 0.311616 4.005320 0.0238728
## PC279 PC280 PC281 PC282 PC283 PC284 PC285
## 1 -1.562260 0.105872 2.65067 -5.282260 3.888830 6.639730 2.11171
## 2 4.835510 -11.322300 1.21322 8.154130 -0.713716 -11.304700 0.91799
## 3 -0.660865 -2.375420 -8.14682 1.242340 -3.303890 0.130936 -1.10911
## 4 -5.694880 1.849500 -5.62426 -3.506870 -7.473330 -2.188800 3.03525
## 5 2.782680 0.441087 3.37600 -3.056790 1.136250 -1.395860 3.68252
## 6 1.530130 0.167205 -3.16492 -0.357814 3.673470 -0.379943 2.53848
## PC286 PC287 PC288 PC289 PC290 PC291 PC292 PC293
## 1 0.71662 9.05454 -2.03840 -2.645260 1.50376 -5.347100 -2.48426 -2.093320
## 2 -3.15564 2.24579 0.38665 5.177570 2.26573 3.246440 -2.74579 -2.137190
## 3 7.10491 -4.28393 1.98677 4.002980 -11.40990 1.584130 1.04011 -3.420510
## 4 -3.13498 -5.84941 -2.84623 4.880940 -1.27230 -3.360200 -2.12749 -0.347738
## 5 -3.53494 -5.27309 -4.49456 -0.379730 2.58931 1.678010 4.44095 -2.886450
## 6 2.71164 -0.55583 1.83435 0.397969 -1.69690 0.834418 -1.44886 0.683888
## PC294 PC295 PC296 PC297 PC298 PC299 PC300
## 1 0.564938 1.189270 -0.215717 -0.193309 2.52496000 0.8311710 1.86713
## 2 6.713890 -7.921350 -1.458520 1.922100 -7.48041000 -2.1499600 2.99386
## 3 -9.700750 2.588300 -2.378510 1.820310 -3.67372000 2.3447900 -2.23979
## 4 -7.489220 3.631760 4.168840 -7.412740 4.88316000 -0.0996591 1.85516
## 5 5.056590 0.926498 4.534130 2.484990 -1.72370000 1.3050700 5.15153
## 6 -0.282787 0.195923 -1.251030 -0.407879 -0.00140079 -1.1553700 -1.36337
## PC301 PC302 PC303 PC304 PC305 PC306 PC307
## 1 -1.234080 2.0990600 2.60633 2.773910 -0.387222 0.964290 5.434160
## 2 -2.612950 -2.7583700 -2.67752 -7.401700 -1.346430 -0.610682 8.473670
## 3 5.925480 1.2315400 -3.44371 4.759820 0.703588 -0.269782 -15.896500
## 4 1.952900 -1.7522500 1.93416 -6.931950 2.717090 -2.069920 3.515050
## 5 2.041660 -2.4227700 -3.11817 4.056730 0.690360 1.789960 -6.371680
## 6 0.422025 0.0439704 1.50440 0.493713 0.820934 1.086450 0.857018
## PC308 PC309 PC310 PC311 PC312 PC313 PC314
## 1 -3.098120 -3.577660 2.748940 -4.349740 -7.371410 2.114720 -2.32059
## 2 1.648540 3.196700 -1.181040 4.646540 4.186800 2.591040 2.10931
## 3 0.383344 5.649710 3.279960 -0.872005 3.648800 -3.890600 3.25618
## 4 0.189916 -6.446400 -2.835460 1.853950 1.330510 0.751542 5.14348
## 5 1.255220 2.257720 0.224074 0.687638 6.334960 -2.992030 -2.83224
## 6 0.422100 0.529036 -0.607926 -1.915980 0.523956 -0.876913 1.53350
## PC315 PC316 PC317 PC318 PC319 PC320 PC321
## 1 -3.675210 -2.53009 -1.3628700 -0.816044 0.965601 1.595910 -1.597450
## 2 -0.240234 2.68239 1.1998100 1.707910 2.424430 0.261341 2.945760
## 3 12.391500 2.31951 0.5746240 0.485511 1.644410 -0.387439 -3.099530
## 4 -1.987160 1.96134 1.4255500 3.832950 5.131330 -1.246780 -3.710480
## 5 1.322460 -1.84151 0.6604480 -4.535690 0.832546 1.287680 5.658290
## 6 -0.469643 1.22542 0.0611388 -2.910350 2.829690 -0.379044 -0.279165
## PC322 PC323 PC324 PC325 PC326 PC327 PC328
## 1 0.8446990 -1.2601100 -1.548870 0.737232 -0.643133 -0.0852386 5.871520
## 2 -2.2525900 0.1277170 3.915310 -1.367510 2.979610 1.7407300 -6.253270
## 3 2.7117600 -2.8954100 -2.098750 -0.476947 -8.409460 1.9182000 0.168238
## 4 -2.4577700 -3.2587100 -4.218530 0.806566 2.068080 -2.1328800 -3.106070
## 5 -0.0989765 2.1089900 1.169240 0.673200 -4.267080 -3.0959200 3.019690
## 6 -1.8082000 -0.0114823 -0.896652 -0.832268 -1.726370 -0.4165190 0.163041
## PC329 PC330 PC331 PC332 PC333 PC334 PC335
## 1 -1.088990 -1.756580 -0.701371 -2.991690 0.6879650 0.930764 -0.257736
## 2 0.781478 1.976890 -1.213700 5.248490 -0.0620777 -2.575750 2.267160
## 3 -3.915400 -3.339350 8.799000 -4.349520 -3.3202600 0.495733 1.754500
## 4 0.835147 -3.391560 1.137320 -2.630220 -2.4563400 2.532180 1.203710
## 5 0.540620 -1.730640 3.542550 -0.535833 -1.6422200 -2.035480 -0.953724
## 6 -0.287095 -0.427652 -0.331723 1.331000 -2.6373400 0.848775 -1.714910
## PC336 PC337 PC338 PC339 PC340 PC341 PC342
## 1 0.6875470 -1.504110 -4.933220 2.79098000 -0.456908 1.103500 1.736470
## 2 1.8304400 -3.453110 -0.110826 -0.17779200 -0.349543 2.316480 -1.508220
## 3 -4.4836100 5.333230 3.749290 -7.12486000 1.041590 0.356310 4.121790
## 4 -2.4055200 6.315770 3.467650 -3.94431000 0.897429 0.896957 4.396440
## 5 -0.0209294 0.525082 0.703207 -1.84145000 1.187370 3.324250 -0.712114
## 6 -0.8593910 0.369210 -1.043290 0.00984732 -0.736136 0.893229 -1.286420
## PC343 PC344 PC345 PC346 PC347 PC348 PC349
## 1 -1.308440 -0.6600610 -0.523703 -0.481355 0.2894440 0.555324 1.4311300
## 2 0.511227 3.4114900 0.161255 0.745598 0.0996532 -1.644380 0.2562320
## 3 1.052670 -2.7426300 2.599600 -3.188650 1.2340800 0.313397 0.6206480
## 4 1.598050 -2.9235100 1.163500 0.296651 0.2258680 2.011860 -0.0119385
## 5 -1.083300 1.4810000 2.135510 1.704580 0.0463003 -0.410127 1.9216500
## 6 -1.371240 0.0143467 1.415520 -0.997524 0.6229410 0.527758 -1.1511600
## PC350 PC351 PC352 PC353 PC354 PC355 PC356
## 1 -1.187760 -1.471570 0.951430 1.0479300 -0.6613750 1.240350 -0.364252
## 2 -1.234300 -2.257950 0.229619 -2.2023700 0.2749780 0.317993 0.128765
## 3 1.251360 2.910630 2.486170 0.0110691 -0.3021180 -2.802150 -2.401880
## 4 2.997510 1.222080 0.370503 -0.8696940 -0.0368718 -0.761892 2.039000
## 5 0.410518 2.134180 2.075850 -2.3572400 -2.1342800 -1.161670 1.889420
## 6 0.565100 -0.519103 -0.117861 0.1862110 -0.3867060 -0.198598 -1.169090
## PC357 PC358 PC359 PC360 PC361 PC362 PC363
## 1 -0.112586 -0.143726 -1.008250 0.144736 -1.485150 0.1037920 1.3313500
## 2 1.008790 0.949393 0.241149 -0.445602 0.954378 0.0632685 1.0416500
## 3 0.763325 0.409146 0.524146 1.952540 0.606963 0.7514830 -2.9797500
## 4 -0.558983 2.679460 -2.167780 -0.732539 -0.675922 -0.9072070 -0.6706960
## 5 1.178090 -0.573516 -0.796475 1.192490 0.432924 0.5788000 -0.0684301
## 6 0.819630 0.660935 0.199208 -0.436236 1.564780 0.1298340 0.5374120
## PC364 PC365 PC366 PC367 PC368 PC369 PC370
## 1 -0.5765340 0.316758 0.8014130 -1.311600 0.110141 -0.169821 -0.1184940
## 2 2.0041600 -0.127861 -0.6567490 0.737087 0.292659 -0.633115 -1.1396400
## 3 -0.0493784 1.381930 0.0566551 0.660984 -0.106757 1.227650 0.9722670
## 4 -1.5080300 1.371850 -0.9336160 -0.298698 0.875987 -0.206142 -0.0666218
## 5 1.3807900 -0.152460 1.8996100 -0.258265 0.295618 -0.102504 0.8708480
## 6 0.4096030 2.308110 1.1630400 1.064840 1.290890 0.892680 -0.8948120
## PC371 PC372 PC373 PC374 PC375 PC376 PC377
## 1 1.753720 0.7536190 -0.9861090 -0.0837764 0.221938 0.8388720 0.4011660
## 2 -0.411188 -0.0697471 0.0792463 -0.2388850 -0.321659 -0.0866674 0.8958640
## 3 -0.384097 -0.8513750 -1.2521100 0.6224400 0.533015 -0.4352070 -0.4293010
## 4 -0.413918 0.1669300 0.8929350 0.8196020 -0.244738 0.0180297 1.1780600
## 5 -0.521419 0.6016860 -0.8922910 0.7327500 -0.105591 -0.9743870 -0.0347993
## 6 -1.201230 1.0381100 0.2839790 0.3053520 1.103960 0.0276012 -1.5053800
## PC378 PC379 PC380 PC381 PC382 PC383 PC384
## 1 -0.752256 0.9752960 0.490820 -0.269417 0.0873277 -0.344573 0.5518270
## 2 -0.433737 -0.7524940 -1.295260 0.787757 0.1983390 0.787952 0.5065750
## 3 1.652110 -0.4329430 -1.482010 -0.439548 -0.9391980 -0.613826 -0.3415560
## 4 2.021730 0.0309600 0.442001 -1.009450 -0.3758430 -0.363921 0.0431491
## 5 -0.203470 -0.0618406 0.276317 -0.148439 -1.1356300 0.435455 -0.6924000
## 6 -1.995460 -0.0354098 1.959150 -1.531380 0.1726200 -3.652670 -0.6434560
## PC385 PC386 PC387 PC388 PC389 PC390 PC391
## 1 1.1478700 -0.4547620 0.0695679 0.7118490 -0.0563128 -0.909940 0.1570580
## 2 -0.3593430 -0.2851090 0.1277660 -0.0341109 -0.2948390 0.572535 -0.8686000
## 3 -1.5548700 0.8800530 -1.0985500 -0.0135044 0.3580010 0.270366 0.0918381
## 4 -0.0427509 0.0361481 0.4904240 -0.1437860 -0.2652720 -0.952677 0.5146400
## 5 -0.7842270 -0.4474540 0.2314990 -0.1420690 -0.1566840 -0.509840 -0.3599460
## 6 0.9404590 0.9628650 0.9784150 -7.9739200 -0.2331820 3.581700 6.2998400
## PC392 PC393 PC394 PC395 PC396 PC397 PC398
## 1 0.2235760 -1.090420 0.665178 0.2150070 0.182539 -0.0245521 -0.409320
## 2 -0.2364810 2.009490 0.089354 0.2188290 0.308847 0.2723120 0.419757
## 3 -0.4665380 0.720738 -0.390695 0.0510920 -0.680924 -0.1871150 0.263301
## 4 -0.0669153 -0.814804 0.167653 0.0271752 -0.122281 -0.4027950 -0.103524
## 5 -0.3461400 1.006380 -0.106034 -0.1872470 0.221418 0.4172700 0.263187
## 6 2.5088700 -23.936100 10.354700 -4.6332800 -1.045430 3.0284500 -2.604380
## PC399 PC400 PC401 PC402 PC403 PC404 PC405
## 1 0.416789 0.08096230 -0.0882664 -0.0917385 -0.238391 -0.0383467 -0.2732210
## 2 -0.392413 0.37890400 -0.0111192 0.4267590 0.354450 0.0803328 0.2785690
## 3 -0.691811 -0.29350900 -0.6236050 0.0606145 -0.515652 -0.0404215 -0.2139190
## 4 -0.142778 -0.00488842 0.3950300 0.5644640 -0.203194 -0.1242060 0.2796070
## 5 0.594169 0.34366200 0.1438910 -0.8060120 -0.326775 -0.0368202 0.0674722
## 6 0.638152 2.54147000 6.8458700 0.6471980 0.150438 -0.6151740 0.1941910
## PC406 PC407 PC408 PC409 Individual region
## 1 -0.3299910 0.0876557 -0.15296900 6.71901e-07 801 Southern Europe
## 2 -0.0657682 -0.2416440 -0.00682686 6.71901e-07 802 Southern Europe
## 3 0.2560050 0.3008330 0.20468500 6.71901e-07 803 Southern Europe
## 4 -0.0575973 0.0322624 -0.04173880 6.71901e-07 804 Southern Europe
## 5 -0.0567106 0.0126253 -0.04804520 6.71901e-07 805 Southern Europe
## 6 0.5064900 0.5506520 0.04508820 6.71901e-07 806 Southern Europe
## Pop_City Country Latitude Longitude Continent Year Region
## 1 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 2 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 3 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 4 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 5 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 6 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## Subregion order
## 1 East Europe 25
## 2 East Europe 25
## 3 East Europe 25
## 4 East Europe 25
## 5 East Europe 25
## 6 East Europe 25
# source the plotting function
source(
here(
"analyses", "my_theme2.R"
)
)
good.shapes = c(1:25,35:38,1:25)
colors2 <-
c("#E69F00",
"#799d10",
"#0072B2",
"yellow3",
"#B22222",
"#808080",
"#21a708",
"#FF7F00",
"#52ef99",
"#8E8BFF",
"#F781BF",
"#8B008B",
"purple",
"#2524f9",
"#E7297A",
"#1E90FF",
"chocolate4")
# make plot by continent and range
ggplot(merged_data, aes(PC1, PC2)) +
geom_point(aes(color = Country, shape = Country), size = 1) +
xlab(paste0("PC1 (", perc[1], " Variance)")) +
ylab(paste0("PC2 (", perc[2], " Variance)")) +
labs(
caption = "PCA with 17,028 SNPs of 409 mosquitoes from 41 localities in Europe."
) +
guides(
color = guide_legend(title = "Country", ncol = 3),
shape = guide_legend(title = "Country", ncol = 3),
fill = guide_legend(title = "Region", ncol = 1)
) +
stat_ellipse(aes(fill = Region, group = Region), geom = "polygon", alpha = 0.2, level = 0.8) +
scale_color_manual(values = colors2) +
scale_shape_manual(values=good.shapes) +
my_theme() +
theme(
plot.caption = element_text(face = "italic"),
legend.position = "top",
legend.justification = "top",
legend.box.just = "center",
legend.box.background = element_blank(),
plot.margin = margin(5.5, 25, 5.5, 5.5, "points"),
legend.margin = margin(10,10,10,10)
)
ggsave(
here(
"output", "europe", "lea", "PCA_lea_pc1_pc2_r01.pdf"
),
width = 8,
height = 6,
units = "in"
)
PC1 and PC3
# source the plotting function
source(
here(
"analyses", "my_theme2.R"
)
)
good.shapes = c(1:25,35:38,1:25)
colors2 <-
c("#E69F00",
"#799d10",
"#0072B2",
"yellow3",
"#B22222",
"#808080",
"#21a708",
"#FF7F00",
"#52ef99",
"#8E8BFF",
"#F781BF",
"#8B008B",
"purple",
"#2524f9",
"#E7297A",
"#1E90FF",
"chocolate4")
# make plot by continent and range
ggplot(merged_data, aes(PC1, PC3)) +
geom_point(aes(color = Country, shape = Country), size = 1) +
xlab(paste0("PC1 (", perc[1], " Variance)")) +
ylab(paste0("PC3 (", perc[3], " Variance)")) +
labs(
caption = "PCA with 17,028 SNPs of 409 mosquitoes from 41 localities in Europe."
) +
guides(
color = guide_legend(title = "Country", ncol = 3),
shape = guide_legend(title = "Country", ncol = 3),
fill = guide_legend(title = "Region", ncol = 1)
) +
stat_ellipse(aes(fill = Region, group = Region), geom = "polygon", alpha = 0.2, level = 0.8) +
scale_color_manual(values = colors2) +
scale_shape_manual(values=good.shapes) +
my_theme() +
theme(
plot.caption = element_text(face = "italic"),
legend.position = "top",
legend.justification = "top",
legend.box.just = "center",
legend.box.background = element_blank(),
plot.margin = margin(5.5, 25, 5.5, 5.5, "points"),
legend.margin = margin(10,10,10,10)
)
# # save the pca plot ####
ggsave(
here(
"output", "europe", "lea", "PCA_lea_pc1_pc3_r01.pdf"
),
width = 8,
height = 6,
units = "in"
)
PC1 and PC4
# source the plotting function
source(
here(
"analyses", "my_theme2.R"
)
)
good.shapes = c(1:25,35:38,1:25)
colors2 <-
c("#E69F00",
"#799d10",
"#0072B2",
"yellow3",
"#B22222",
"#808080",
"#21a708",
"#FF7F00",
"#52ef99",
"#8E8BFF",
"#F781BF",
"#8B008B",
"purple",
"#2524f9",
"#E7297A",
"#1E90FF",
"chocolate4")
# make plot by continent and range
ggplot(merged_data, aes(PC1, PC4)) +
geom_point(aes(color = Country, shape = Country), size = 1) +
xlab(paste0("PC1 (", perc[1], " Variance)")) +
ylab(paste0("PC4 (", perc[4], " Variance)")) +
labs(
caption = "PCA with 17,028 SNPs of 409 mosquitoes from 41 localities in Europe."
) +
guides(
color = guide_legend(title = "Country", ncol = 3),
shape = guide_legend(title = "Country", ncol = 3),
fill = guide_legend(title = "Region", ncol = 1)
) +
stat_ellipse(aes(fill = Region, group = Region), geom = "polygon", alpha = 0.2, level = 0.8) +
scale_color_manual(values = colors2) +
scale_shape_manual(values=good.shapes) +
my_theme() +
theme(
plot.caption = element_text(face = "italic"),
legend.position = "top",
legend.justification = "top",
legend.box.just = "center",
legend.box.background = element_blank(),
plot.margin = margin(5.5, 25, 5.5, 5.5, "points"),
legend.margin = margin(10,10,10,10)
)
We will do 5 repetitions
# set output dir
# main options
# K = number of ancestral populations
# entropy = TRUE computes the cross-entropy criterion, # CPU = 4 is the number of CPU used (hidden input) project = NULL
project = snmf(
genotype,
K = 1:20,
project = "new",
repetitions = 5,
CPU = 4,
entropy = TRUE
)
project = load.snmfProject("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmfProject")
Cross entropy
# Open a new pdf file
pdf(here("output","europe","lea","lea_cross_entropy_europe_r01.pdf"), width = 6, height = 4)
# Create your plot
plot(project, col = "blue", pch = 19, cex = 1.2)
# Close the pdf file
dev.off()
## png
## 2
replace k15 with whatever is best
Default plot
# select the best run for K = 15 clusters
best = which.min(cross.entropy(project, K = 15))
# best is run 3
barchart(project, K = 15, run = best,
border = NA, space = 0,
col = colors2,
xlab = "Individuals",
ylab = "Ancestry proportions",
main = "Ancestry matrix") -> bp
axis(1, at = 1:length(bp$order),
labels = bp$order, las=1,
cex.axis = .4)
Mean admixture by country using ggplot
color_palette <-
c(
"#008080",
"purple4",
"yellow2",
"#B22222",
"blue",
"#FFB347",
"green",
"#FFFF99",
"#F49AC2",
"purple",
"#B20CD9",
"#1E90FF",
"chocolate4",
"#77DD77",
"#FF8C1A")
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Extract ancestry coefficients
Q_values <- as.data.frame(Q(project, K = 15, run = best))
# Create a named vector to map countries to regions
country_to_region <- c(
"France" = "1-Western Europe",
"Portugal" = "2-Southern Europe",
"Spain" = "2-Southern Europe",
"Italy" = "2-Southern Europe",
"Malta" = "2-Southern Europe",
"Slovenia" = "2-Southern Europe",
"Croatia" = "2-Southern Europe",
"Albania" = "2-Southern Europe",
"Serbia" = "2-Southern Europe",
"Greece" = "2-Southern Europe",
"Romania" = "3-Eastern Europe",
"Bulgaria" = "3-Eastern Europe",
"Turkey" = "3-Eastern Europe",
"Ukraine" = "3-Eastern Europe",
"Russia" = "3-Eastern Europe",
"Georgia" = "3-Eastern Europe",
"Armenia" = "3-Eastern Europe"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Add individual IDs and pops ids
Q_values$ind <- inds
Q_values$pop <- pops
# Melt the data frame for plotting
Q_melted <- melt(Q_values, id.vars = c("ind", "pop"))
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country) |>
mutate(Region_Country = factor(Region_Country, levels = unique(Region_Country)))
# Group by Country and calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(Region_Country, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <- data.frame(Region_Country = unique(Q_grouped$Region_Country))
# Add the order of each country to ensure correct placement of borders
borders$order <- 1:nrow(borders) + 0.5 # Shift borders to the right edge of the bars
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(Region_Country) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# source the plotting function
source(
here(
"analyses", "my_theme2.R"
)
)
# Generate all potential variable names
all_variables <- paste0("V", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge this data frame with Q_grouped_filtered to create the new color column
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create bar chart
ggplot(Q_grouped_filtered, aes(x = Region_Country, y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_segment(data = borders, aes(x = order, xend = order, y = 0, yend = 1, fill = NULL), linetype = "solid", color = "#2C444A") + # Add borders
my_theme() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none") + # Hide legend
xlab("") + # Suppress x-axis label
ylab("Ancestry proportions") +
ggtitle("Ancestry matrix") +
labs(caption = "Each bar represents the average ancestry proportions for individuals in a given country for k=15.") +
scale_fill_manual(values = color_palette) +
scale_x_discrete(labels = function(x) gsub(".*_", "", x)) # Remove Region prefix from labels
# save the pca plot ####
ggsave(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/LEA_k=15_r01_countries.pdf"
),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
## [1] 2
Using ggplot2 for individual admixtures
leak15 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K15/run2/r2_0.01_r2.15.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak15)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0000999 0.0161 0.357 1.70e-1 7.80e-2 3.48e-1 5.69e-3 9.99e-5 2.40e-3
## 2 0.0000999 0.0374 0.0700 2.30e-1 9.99e-5 6.56e-1 9.99e-5 9.99e-5 9.99e-5
## 3 0.0000999 0.0000999 0.0000999 9.99e-1 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 0.00762 0.0471 0.357 9.99e-5 8.61e-3 5.02e-1 9.99e-5 9.99e-5 9.99e-5
## 5 0.0106 0.000936 0.445 9.99e-5 5.98e-2 4.22e-1 3.20e-2 9.99e-5 1.66e-2
## 6 0.0000999 0.0000999 0.279 1.92e-1 1.01e-1 3.34e-1 9.99e-5 9.99e-5 9.99e-5
## # ℹ 6 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 9.99370e-05 1.61473e-02 3.56810e-01 0.169823000 7.79935e-02
## 2 1066 SOC 9.99101e-05 3.73527e-02 7.00235e-02 0.230170000 9.99101e-05
## 3 1067 SOC 9.98737e-05 9.98737e-05 9.98737e-05 0.998602000 9.98737e-05
## 4 1068 SOC 7.62238e-03 4.70893e-02 3.56926e-01 0.000099937 8.60978e-03
## 5 1069 SOC 1.05570e-02 9.35863e-04 4.44524e-01 0.000099937 5.97901e-02
## 6 1070 SOC 9.99460e-05 9.99460e-05 2.78666e-01 0.191593000 1.00813e-01
## X6 X7 X8 X9 X10 X11
## 1 3.48388e-01 5.68796e-03 9.99370e-05 2.40282e-03 9.99370e-05 9.99370e-05
## 2 6.55789e-01 9.99101e-05 9.99101e-05 9.99101e-05 5.66503e-03 9.99101e-05
## 3 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05
## 4 5.02351e-01 9.99370e-05 9.99370e-05 9.99370e-05 9.99370e-05 9.99370e-05
## 5 4.22047e-01 3.20259e-02 9.99370e-05 1.66026e-02 9.99370e-05 9.99370e-05
## 6 3.34378e-01 9.99460e-05 9.99460e-05 9.99460e-05 1.74610e-02 1.33452e-02
## X12 X13 X14 X15
## 1 9.99370e-05 9.99370e-05 2.20478e-02 9.99370e-05
## 2 9.99101e-05 9.99101e-05 9.99101e-05 9.99101e-05
## 3 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05
## 4 3.12340e-02 9.99370e-05 2.26001e-02 2.28682e-02
## 5 9.99370e-05 9.99370e-05 9.99370e-05 1.28186e-02
## 6 9.99460e-05 5.59757e-02 1.03858e-03 6.12966e-03
Rename the columns
# Rename the columns starting from the third one
leak15 <- leak15 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak15)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 9.99370e-05 1.61473e-02 3.56810e-01 0.169823000 7.79935e-02
## 2 1066 SOC 9.99101e-05 3.73527e-02 7.00235e-02 0.230170000 9.99101e-05
## 3 1067 SOC 9.98737e-05 9.98737e-05 9.98737e-05 0.998602000 9.98737e-05
## 4 1068 SOC 7.62238e-03 4.70893e-02 3.56926e-01 0.000099937 8.60978e-03
## 5 1069 SOC 1.05570e-02 9.35863e-04 4.44524e-01 0.000099937 5.97901e-02
## 6 1070 SOC 9.99460e-05 9.99460e-05 2.78666e-01 0.191593000 1.00813e-01
## v6 v7 v8 v9 v10 v11
## 1 3.48388e-01 5.68796e-03 9.99370e-05 2.40282e-03 9.99370e-05 9.99370e-05
## 2 6.55789e-01 9.99101e-05 9.99101e-05 9.99101e-05 5.66503e-03 9.99101e-05
## 3 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05
## 4 5.02351e-01 9.99370e-05 9.99370e-05 9.99370e-05 9.99370e-05 9.99370e-05
## 5 4.22047e-01 3.20259e-02 9.99370e-05 1.66026e-02 9.99370e-05 9.99370e-05
## 6 3.34378e-01 9.99460e-05 9.99460e-05 9.99460e-05 1.74610e-02 1.33452e-02
## v12 v13 v14 v15
## 1 9.99370e-05 9.99370e-05 2.20478e-02 9.99370e-05
## 2 9.99101e-05 9.99101e-05 9.99101e-05 9.99101e-05
## 3 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05
## 4 3.12340e-02 9.99370e-05 2.26001e-02 2.28682e-02
## 5 9.99370e-05 9.99370e-05 9.99370e-05 1.28186e-02
## 6 9.99460e-05 5.59757e-02 1.03858e-03 6.12966e-03
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak15 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$ountry == rc))) + 0.5 # Shift borders to the right edge of the bars
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
## Warning: Unknown or uninitialised column: `ountry`.
## Warning in max(which(Q_ordered$ountry == rc)): no non-missing arguments to max;
## returning -Inf
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"purple4",
"#FFFF99",
"blue",
"chocolate4",
"#77DD77",
"#1E90FF",
"yellow",
"#FFB347",
"#B22222",
"green",
"#B20CD9",
"#FF8C1A",
"#F49AC2",
"#008080",
"navy"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n LEA inference for k15 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
ggsave(
here("output", "europe", "lea", "lea_k=15_europe_r2_01_run2_final.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Try K=20, for r<0.01, for comparison
Find the best run
## [1] 2
# Extract ancestry coefficients
leak20 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K20/run2/r2_0.01_r2.20.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak20)
## # A tibble: 6 × 20
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9.99e-5 3.19e-2 9.99e-5 9.99e-5 9.99e-5 1.44e-2 9.99e-5 0.317 6.43e-2 6.74e-3
## 2 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 0.373 9.99e-5 9.99e-5
## 3 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 0.0148 9.98e-5 9.98e-5
## 4 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 1.21e-3 9.99e-5 0.202 9.11e-4 9.99e-5
## 5 2.14e-3 9.99e-5 6.28e-2 9.99e-5 9.99e-5 5.13e-3 2.63e-4 0.0480 3.97e-2 1.01e-2
## 6 9.99e-5 4.23e-4 2.49e-2 4.49e-3 1.07e-2 9.99e-5 9.99e-5 0.0543 9.92e-2 9.75e-3
## # ℹ 10 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## # X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 9.99010e-05 3.18942e-02 9.99010e-05 9.99010e-05 9.99010e-05
## 2 1066 SOC 9.98560e-05 9.98560e-05 9.98560e-05 9.98560e-05 9.98560e-05
## 3 1067 SOC 9.98376e-05 9.98376e-05 9.98376e-05 9.98376e-05 9.98376e-05
## 4 1068 SOC 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05
## 5 1069 SOC 2.14280e-03 9.99280e-05 6.27581e-02 9.99280e-05 9.99280e-05
## 6 1070 SOC 9.99460e-05 4.23386e-04 2.49194e-02 4.49292e-03 1.07251e-02
## X6 X7 X8 X9 X10 X11
## 1 1.43605e-02 9.99010e-05 0.3167410 6.42990e-02 6.74141e-03 9.99010e-05
## 2 9.98560e-05 9.98560e-05 0.3729030 9.98560e-05 9.98560e-05 9.98560e-05
## 3 9.98376e-05 9.98376e-05 0.0147977 9.98376e-05 9.98376e-05 9.98376e-05
## 4 1.21436e-03 9.99010e-05 0.2023060 9.10783e-04 9.99010e-05 1.33863e-02
## 5 5.12556e-03 2.62690e-04 0.0479641 3.96621e-02 1.00753e-02 9.99280e-05
## 6 9.99460e-05 9.99460e-05 0.0542638 9.91766e-02 9.74751e-03 9.99460e-05
## X12 X13 X14 X15 X16 X17
## 1 2.20468e-02 9.99010e-05 0.000099901 2.65770e-01 2.76414e-01 9.99010e-05
## 2 1.39859e-01 9.98560e-05 0.120439000 3.65201e-01 9.98560e-05 9.98560e-05
## 3 9.98376e-05 9.98376e-05 0.983405000 9.98376e-05 9.98376e-05 9.98376e-05
## 4 1.95174e-01 8.72494e-03 0.000099901 3.51139e-01 2.09608e-01 9.99010e-05
## 5 3.04398e-03 1.42962e-02 0.000099928 4.57943e-01 3.49473e-01 9.99280e-05
## 6 2.90642e-02 5.33012e-03 0.227621000 4.13131e-01 8.46080e-02 2.81484e-02
## X18 X19 X20
## 1 6.34219e-04 9.99010e-05 9.99010e-05
## 2 9.98560e-05 9.98560e-05 9.98560e-05
## 3 9.98376e-05 9.98376e-05 9.98376e-05
## 4 9.99010e-05 1.64380e-02 9.99010e-05
## 5 9.99280e-05 9.99280e-05 6.45405e-03
## 6 9.99460e-05 7.74950e-03 9.99460e-05
Rename the columns
# Rename the columns starting from the third one
leak20 <- leak20 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak20)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 9.99010e-05 3.18942e-02 9.99010e-05 9.99010e-05 9.99010e-05
## 2 1066 SOC 9.98560e-05 9.98560e-05 9.98560e-05 9.98560e-05 9.98560e-05
## 3 1067 SOC 9.98376e-05 9.98376e-05 9.98376e-05 9.98376e-05 9.98376e-05
## 4 1068 SOC 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05
## 5 1069 SOC 2.14280e-03 9.99280e-05 6.27581e-02 9.99280e-05 9.99280e-05
## 6 1070 SOC 9.99460e-05 4.23386e-04 2.49194e-02 4.49292e-03 1.07251e-02
## v6 v7 v8 v9 v10 v11
## 1 1.43605e-02 9.99010e-05 0.3167410 6.42990e-02 6.74141e-03 9.99010e-05
## 2 9.98560e-05 9.98560e-05 0.3729030 9.98560e-05 9.98560e-05 9.98560e-05
## 3 9.98376e-05 9.98376e-05 0.0147977 9.98376e-05 9.98376e-05 9.98376e-05
## 4 1.21436e-03 9.99010e-05 0.2023060 9.10783e-04 9.99010e-05 1.33863e-02
## 5 5.12556e-03 2.62690e-04 0.0479641 3.96621e-02 1.00753e-02 9.99280e-05
## 6 9.99460e-05 9.99460e-05 0.0542638 9.91766e-02 9.74751e-03 9.99460e-05
## v12 v13 v14 v15 v16 v17
## 1 2.20468e-02 9.99010e-05 0.000099901 2.65770e-01 2.76414e-01 9.99010e-05
## 2 1.39859e-01 9.98560e-05 0.120439000 3.65201e-01 9.98560e-05 9.98560e-05
## 3 9.98376e-05 9.98376e-05 0.983405000 9.98376e-05 9.98376e-05 9.98376e-05
## 4 1.95174e-01 8.72494e-03 0.000099901 3.51139e-01 2.09608e-01 9.99010e-05
## 5 3.04398e-03 1.42962e-02 0.000099928 4.57943e-01 3.49473e-01 9.99280e-05
## 6 2.90642e-02 5.33012e-03 0.227621000 4.13131e-01 8.46080e-02 2.81484e-02
## v18 v19 v20
## 1 6.34219e-04 9.99010e-05 9.99010e-05
## 2 9.98560e-05 9.98560e-05 9.98560e-05
## 3 9.98376e-05 9.98376e-05 9.98376e-05
## 4 9.99010e-05 1.64380e-02 9.99010e-05
## 5 9.99280e-05 9.99280e-05 6.45405e-03
## 6 9.99460e-05 7.74950e-03 9.99460e-05
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak20 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette20 <-
c(
"#77DD77",
"#F49AC2",
"#B20CD9",
"#FF8C1A",
"chocolate4",
"green4",
"#FFFF99",
"yellow",
"#B22222",
"orchid",
"gray",
"#008080",
"blue",
"#75FAFF",
"purple4",
"green",
"purple",
"#1E90FF",
"goldenrod3",
"#FFB347"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:20)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette20[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=20.\n LEA inference for k20 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette20) +
expand_limits(y = c(0, 1.5))
## [1] 2
# Extract ancestry coefficients
leak14 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K14/run2/r2_0.01_r2.14.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak14)
## # A tibble: 6 × 14
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9.99e-5 0.244 1.92e-3 9.99e-5 9.99e-5 3.27e-4 9.99e-5 8.54e-2 6.97e-2 1.74e-3
## 2 9.99e-5 0.378 9.99e-5 9.99e-5 2.45e-2 9.99e-5 9.99e-5 2.37e-2 9.99e-5 9.99e-5
## 3 9.99e-5 0.999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 2.82e-2 0.137 2.80e-2 6.37e-3 7.19e-2 1.00e-4 1.00e-4 3.81e-2 1.00e-4 1.06e-2
## 5 2.08e-2 0.0578 1.00e-4 1.85e-2 1.00e-4 2.16e-2 1.00e-4 7.61e-2 3.51e-3 1.00e-4
## 6 7.24e-3 0.259 5.76e-2 1.00e-4 1.00e-4 1.00e-4 7.38e-3 1.12e-1 5.01e-3 1.00e-4
## # ℹ 4 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 9.99460e-05 0.2441980 1.92053e-03 9.99460e-05 9.99460e-05
## 2 1066 SOC 9.99191e-05 0.3777020 9.99191e-05 9.99191e-05 2.45387e-02
## 3 1067 SOC 9.98828e-05 0.9987020 9.98828e-05 9.98828e-05 9.98828e-05
## 4 1068 SOC 2.82305e-02 0.1369630 2.80010e-02 6.37437e-03 7.18853e-02
## 5 1069 SOC 2.08056e-02 0.0578426 9.99550e-05 1.85149e-02 9.99550e-05
## 6 1070 SOC 7.24219e-03 0.2586240 5.75541e-02 9.99550e-05 9.99550e-05
## X6 X7 X8 X9 X10 X11
## 1 3.27001e-04 9.99460e-05 8.54338e-02 6.97035e-02 1.74131e-03 9.99460e-05
## 2 9.99191e-05 9.99191e-05 2.36881e-02 9.99191e-05 9.99191e-05 9.99191e-05
## 3 9.98828e-05 9.98828e-05 9.98828e-05 9.98828e-05 9.98828e-05 9.98828e-05
## 4 9.99555e-05 9.99555e-05 3.81138e-02 9.99555e-05 1.05863e-02 9.99555e-05
## 5 2.15542e-02 9.99550e-05 7.61393e-02 3.50856e-03 9.99550e-05 5.69393e-02
## 6 9.99550e-05 7.38193e-03 1.12264e-01 5.00824e-03 9.99550e-05 2.80213e-02
## X12 X13 X14
## 1 5.65467e-01 3.06091e-02 9.99460e-05
## 2 5.33355e-01 3.98165e-02 9.99191e-05
## 3 9.98828e-05 9.98828e-05 9.98828e-05
## 4 6.42568e-01 3.67776e-02 9.99555e-05
## 5 7.22112e-01 2.20843e-02 9.99550e-05
## 6 5.07450e-01 9.99550e-05 1.59542e-02
Rename the columns
# Rename the columns starting from the third one
leak14 <- leak14 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak14)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 9.99460e-05 0.2441980 1.92053e-03 9.99460e-05 9.99460e-05
## 2 1066 SOC 9.99191e-05 0.3777020 9.99191e-05 9.99191e-05 2.45387e-02
## 3 1067 SOC 9.98828e-05 0.9987020 9.98828e-05 9.98828e-05 9.98828e-05
## 4 1068 SOC 2.82305e-02 0.1369630 2.80010e-02 6.37437e-03 7.18853e-02
## 5 1069 SOC 2.08056e-02 0.0578426 9.99550e-05 1.85149e-02 9.99550e-05
## 6 1070 SOC 7.24219e-03 0.2586240 5.75541e-02 9.99550e-05 9.99550e-05
## v6 v7 v8 v9 v10 v11
## 1 3.27001e-04 9.99460e-05 8.54338e-02 6.97035e-02 1.74131e-03 9.99460e-05
## 2 9.99191e-05 9.99191e-05 2.36881e-02 9.99191e-05 9.99191e-05 9.99191e-05
## 3 9.98828e-05 9.98828e-05 9.98828e-05 9.98828e-05 9.98828e-05 9.98828e-05
## 4 9.99555e-05 9.99555e-05 3.81138e-02 9.99555e-05 1.05863e-02 9.99555e-05
## 5 2.15542e-02 9.99550e-05 7.61393e-02 3.50856e-03 9.99550e-05 5.69393e-02
## 6 9.99550e-05 7.38193e-03 1.12264e-01 5.00824e-03 9.99550e-05 2.80213e-02
## v12 v13 v14
## 1 5.65467e-01 3.06091e-02 9.99460e-05
## 2 5.33355e-01 3.98165e-02 9.99191e-05
## 3 9.98828e-05 9.98828e-05 9.98828e-05
## 4 6.42568e-01 3.67776e-02 9.99555e-05
## 5 7.22112e-01 2.20843e-02 9.99550e-05
## 6 5.07450e-01 9.99550e-05 1.59542e-02
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak14 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette14 <-
c(
"#B22222",
"blue",
"green4",
"#FF8C1A",
"#FFB347",
"#F49AC2",
"#B20CD9",
"yellow",
"purple4",
"#008080",
"#1E90FF",
"green",
"chocolate4",
"#77DD77"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:14)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette14[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=14.\n LEA inference for k20 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette14) +
expand_limits(y = c(0, 1.5))
## [1] 5
leak16 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K16/run5/r2_0.01_r5.16.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak16)
## # A tibble: 6 × 16
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0760 0.333 0.00539 9.99e-5 9.99e-5 9.99e-5 4.47e-3 9.99e-5 3.97e-1
## 2 0.0000999 0.0403 0.0000999 9.99e-5 9.99e-5 1.76e-2 9.99e-5 9.32e-3 6.02e-1
## 3 0.0000999 0.0000999 0.0000999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 0.0192 0.355 0.0000999 2.02e-2 9.99e-5 9.99e-5 9.99e-5 4.57e-2 4.96e-1
## 5 0.0400 0.396 0.0308 1.89e-3 9.99e-5 9.99e-5 4.15e-2 9.99e-5 4.72e-1
## 6 0.101 0.151 0.0117 9.99e-5 9.99e-5 9.99e-5 3.09e-2 9.99e-5 4.40e-1
## # ℹ 7 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # X15 <dbl>, X16 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 7.59979e-02 3.32717e-01 5.39145e-03 9.99300e-05 9.99300e-05
## 2 1066 SOC 9.99100e-05 4.02712e-02 9.99100e-05 9.99100e-05 9.99100e-05
## 3 1067 SOC 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05
## 4 1068 SOC 1.91679e-02 3.54843e-01 9.99370e-05 2.02087e-02 9.99370e-05
## 5 1069 SOC 4.00484e-02 3.96251e-01 3.07573e-02 1.88516e-03 9.99279e-05
## 6 1070 SOC 1.01392e-01 1.51014e-01 1.16719e-02 9.99280e-05 9.99280e-05
## X6 X7 X8 X9 X10 X11
## 1 9.99300e-05 4.46826e-03 9.99300e-05 3.97036e-01 2.96146e-02 9.99300e-05
## 2 1.75523e-02 9.99100e-05 9.32071e-03 6.02226e-01 3.12881e-02 9.99100e-05
## 3 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05
## 4 9.99370e-05 9.99370e-05 4.57357e-02 4.95649e-01 2.71616e-02 9.99370e-05
## 5 9.99279e-05 4.14628e-02 9.99279e-05 4.71894e-01 9.99279e-05 5.42984e-03
## 6 9.99280e-05 3.09043e-02 9.99280e-05 4.39759e-01 9.99280e-05 9.99280e-05
## X12 X13 X14 X15 X16
## 1 1.46434e-01 9.99300e-05 9.99300e-05 6.81396e-03 8.26647e-04
## 2 2.98343e-01 9.99100e-05 9.99100e-05 9.99100e-05 9.99100e-05
## 3 9.98502e-01 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05
## 4 4.31377e-03 9.99370e-05 6.52538e-03 2.56952e-02 9.99370e-05
## 5 9.99279e-05 9.99279e-05 9.99279e-05 9.99279e-05 1.14722e-02
## 6 2.03919e-01 1.59104e-02 9.99280e-05 4.46287e-02 9.99280e-05
Rename the columns
# Rename the columns starting from the third one
leak16 <- leak16 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak16)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 7.59979e-02 3.32717e-01 5.39145e-03 9.99300e-05 9.99300e-05
## 2 1066 SOC 9.99100e-05 4.02712e-02 9.99100e-05 9.99100e-05 9.99100e-05
## 3 1067 SOC 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05
## 4 1068 SOC 1.91679e-02 3.54843e-01 9.99370e-05 2.02087e-02 9.99370e-05
## 5 1069 SOC 4.00484e-02 3.96251e-01 3.07573e-02 1.88516e-03 9.99279e-05
## 6 1070 SOC 1.01392e-01 1.51014e-01 1.16719e-02 9.99280e-05 9.99280e-05
## v6 v7 v8 v9 v10 v11
## 1 9.99300e-05 4.46826e-03 9.99300e-05 3.97036e-01 2.96146e-02 9.99300e-05
## 2 1.75523e-02 9.99100e-05 9.32071e-03 6.02226e-01 3.12881e-02 9.99100e-05
## 3 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05
## 4 9.99370e-05 9.99370e-05 4.57357e-02 4.95649e-01 2.71616e-02 9.99370e-05
## 5 9.99279e-05 4.14628e-02 9.99279e-05 4.71894e-01 9.99279e-05 5.42984e-03
## 6 9.99280e-05 3.09043e-02 9.99280e-05 4.39759e-01 9.99280e-05 9.99280e-05
## v12 v13 v14 v15 v16
## 1 1.46434e-01 9.99300e-05 9.99300e-05 6.81396e-03 8.26647e-04
## 2 2.98343e-01 9.99100e-05 9.99100e-05 9.99100e-05 9.99100e-05
## 3 9.98502e-01 9.98647e-05 9.98647e-05 9.98647e-05 9.98647e-05
## 4 4.31377e-03 9.99370e-05 6.52538e-03 2.56952e-02 9.99370e-05
## 5 9.99279e-05 9.99279e-05 9.99279e-05 9.99279e-05 1.14722e-02
## 6 2.03919e-01 1.59104e-02 9.99280e-05 4.46287e-02 9.99280e-05
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak16 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette16 <-
c(
"green",
"#77DD77",
"#FFFF99",
"#1E90FF",
"purple",
"#B22222",
"#B20CD9",
"green4",
"purple4",
"navy",
"chocolate4",
"#F49AC2",
"#008080",
"blue",
"yellow",
"#FF8C1A"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:16)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette16[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=16.\n LEA inference for k16 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette16) +
expand_limits(y = c(0, 1.5))
## [1] 5
leak18 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K18/run5/r2_0.01_r5.18.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak18)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9.99e-5 9.99e-5 9.99e-5 3.57e-1 0.375 5.15e-3 3.43e-2 9.99e-5 5.99e-2 9.99e-5
## 2 9.99e-5 9.99e-5 9.99e-5 4.70e-2 0.358 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 3 9.99e-5 9.99e-5 9.99e-5 9.99e-5 0.0149 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 9.99e-5 9.99e-5 1.86e-2 3.23e-1 0.206 9.99e-5 9.99e-5 9.99e-5 7.48e-3 9.99e-5
## 5 9.99e-5 9.99e-5 9.99e-5 4.25e-1 0.0851 1.56e-2 9.99e-5 9.99e-5 3.34e-2 6.21e-4
## 6 3.64e-2 9.99e-5 4.00e-3 1.95e-1 0.0691 9.99e-5 9.24e-3 1.39e-2 9.75e-2 9.99e-5
## # ℹ 8 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>,
## # X16 <dbl>, X17 <dbl>, X18 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 9.98920e-05 9.98920e-05 9.98920e-05 3.56610e-01 0.3752300
## 2 1066 SOC 9.99010e-05 9.99010e-05 9.99010e-05 4.70472e-02 0.3580990
## 3 1067 SOC 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 0.0148745
## 4 1068 SOC 9.99190e-05 9.99190e-05 1.85653e-02 3.23290e-01 0.2056600
## 5 1069 SOC 9.99009e-05 9.99009e-05 9.99009e-05 4.25347e-01 0.0851131
## 6 1070 SOC 3.63928e-02 9.99370e-05 4.00021e-03 1.95500e-01 0.0691410
## X6 X7 X8 X9 X10 X11
## 1 5.14673e-03 3.42779e-02 9.98920e-05 5.98864e-02 9.98920e-05 9.98920e-05
## 2 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05
## 3 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05
## 4 9.99190e-05 9.99190e-05 9.99190e-05 7.48143e-03 9.99190e-05 4.40429e-02
## 5 1.56399e-02 9.99009e-05 9.99009e-05 3.34383e-02 6.20857e-04 9.99009e-05
## 6 9.99370e-05 9.23927e-03 1.38603e-02 9.75237e-02 9.99370e-05 9.99370e-05
## X12 X13 X14 X15 X16 X17
## 1 1.67649e-01 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05
## 2 4.58486e-01 2.86416e-02 6.48092e-04 9.99010e-05 9.99010e-05 4.05724e-03
## 3 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05
## 4 3.44084e-01 1.65582e-02 9.99190e-05 9.99190e-05 1.42503e-02 2.51683e-02
## 5 3.72512e-01 9.99009e-05 9.99009e-05 6.62296e-02 9.99009e-05 9.99009e-05
## 6 3.56064e-01 9.99370e-05 1.71571e-02 9.99370e-05 1.36600e-02 9.99370e-05
## X18
## 1 9.98920e-05
## 2 1.01921e-01
## 3 9.83528e-01
## 4 9.99190e-05
## 5 9.99009e-05
## 6 1.86762e-01
Rename the columns
# Rename the columns starting from the third one
leak18 <- leak18 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak18)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 9.98920e-05 9.98920e-05 9.98920e-05 3.56610e-01 0.3752300
## 2 1066 SOC 9.99010e-05 9.99010e-05 9.99010e-05 4.70472e-02 0.3580990
## 3 1067 SOC 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 0.0148745
## 4 1068 SOC 9.99190e-05 9.99190e-05 1.85653e-02 3.23290e-01 0.2056600
## 5 1069 SOC 9.99009e-05 9.99009e-05 9.99009e-05 4.25347e-01 0.0851131
## 6 1070 SOC 3.63928e-02 9.99370e-05 4.00021e-03 1.95500e-01 0.0691410
## v6 v7 v8 v9 v10 v11
## 1 5.14673e-03 3.42779e-02 9.98920e-05 5.98864e-02 9.98920e-05 9.98920e-05
## 2 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05 9.99010e-05
## 3 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05
## 4 9.99190e-05 9.99190e-05 9.99190e-05 7.48143e-03 9.99190e-05 4.40429e-02
## 5 1.56399e-02 9.99009e-05 9.99009e-05 3.34383e-02 6.20857e-04 9.99009e-05
## 6 9.99370e-05 9.23927e-03 1.38603e-02 9.75237e-02 9.99370e-05 9.99370e-05
## v12 v13 v14 v15 v16 v17
## 1 1.67649e-01 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05 9.98920e-05
## 2 4.58486e-01 2.86416e-02 6.48092e-04 9.99010e-05 9.99010e-05 4.05724e-03
## 3 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05 9.98557e-05
## 4 3.44084e-01 1.65582e-02 9.99190e-05 9.99190e-05 1.42503e-02 2.51683e-02
## 5 3.72512e-01 9.99009e-05 9.99009e-05 6.62296e-02 9.99009e-05 9.99009e-05
## 6 3.56064e-01 9.99370e-05 1.71571e-02 9.99370e-05 1.36600e-02 9.99370e-05
## v18
## 1 9.98920e-05
## 2 1.01921e-01
## 3 9.83528e-01
## 4 9.99190e-05
## 5 9.99009e-05
## 6 1.86762e-01
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak18 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette18 <-
c(
"#F49AC2",
"purple",
"green",
"#75FAFF",
"blue",
"#B20CD9",
"#008080",
"green4",
"chocolate4",
"#77DD77",
"#FFFF99",
"purple4",
"#FFB347",
"#B22222",
"#1E90FF",
"orchid",
"yellow",
"#FF8C1A"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:18)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette18[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n LEA inference for k18 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette18) +
expand_limits(y = c(0, 1.5))
## [1] 2
# Extract ancestry coefficients
leak13 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K13/run2/r2_0.01_r2.13.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak13)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.43e-1 1.00e-4 2.40e-2 1.00e-4 2.17e-2 1.00e-4 8.49e-3 0.269 1.16e-1 1.00e-4
## 2 5.60e-1 9.99e-5 9.99e-5 6.00e-4 4.32e-2 7.56e-3 1.42e-2 0.368 6.39e-3 9.99e-5
## 3 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 0.999 9.99e-5 9.99e-5
## 4 6.48e-1 1.00e-4 1.22e-2 1.74e-2 4.76e-2 1.00e-4 7.65e-2 0.134 4.63e-2 1.00e-4
## 5 7.27e-1 1.00e-4 1.00e-4 1.42e-2 2.06e-2 1.00e-4 4.11e-4 0.0685 9.73e-2 3.02e-2
## 6 5.10e-1 1.83e-2 4.53e-2 7.05e-3 1.00e-4 6.77e-3 1.00e-4 0.254 1.32e-1 2.12e-2
## # ℹ 3 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 5.42700e-01 9.99550e-05 2.39979e-02 9.99550e-05 2.17132e-02
## 2 1066 SOC 5.59728e-01 9.99460e-05 9.99460e-05 5.99640e-04 4.32295e-02
## 3 1067 SOC 9.98918e-05 9.98918e-05 9.98918e-05 9.98918e-05 9.98918e-05
## 4 1068 SOC 6.47624e-01 9.99640e-05 1.22187e-02 1.73512e-02 4.76202e-02
## 5 1069 SOC 7.26633e-01 9.99640e-05 9.99640e-05 1.42205e-02 2.05775e-02
## 6 1070 SOC 5.10010e-01 1.82754e-02 4.53471e-02 7.04568e-03 9.99640e-05
## X6 X7 X8 X9 X10 X11
## 1 9.99550e-05 8.49048e-03 0.2686440 1.15660e-01 9.99550e-05 1.45766e-02
## 2 7.56064e-03 1.42134e-02 0.3676750 6.39420e-03 9.99460e-05 9.99460e-05
## 3 9.98918e-05 9.98918e-05 0.9988010 9.98918e-05 9.98918e-05 9.98918e-05
## 4 9.99640e-05 7.65264e-02 0.1342570 4.63046e-02 9.99640e-05 4.81630e-03
## 5 9.99640e-05 4.10980e-04 0.0685492 9.72608e-02 3.02222e-02 9.99640e-05
## 6 6.77253e-03 9.99640e-05 0.2542530 1.32208e-01 2.11997e-02 4.48785e-03
## X12 X13
## 1 3.71808e-03 9.99550e-05
## 2 9.99460e-05 9.99460e-05
## 3 9.98918e-05 9.98918e-05
## 4 9.99640e-05 1.28824e-02
## 5 2.24654e-02 1.92606e-02
## 6 9.99640e-05 9.99640e-05
Rename the columns
# Rename the columns starting from the third one
leak13 <- leak13 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak13)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 5.42700e-01 9.99550e-05 2.39979e-02 9.99550e-05 2.17132e-02
## 2 1066 SOC 5.59728e-01 9.99460e-05 9.99460e-05 5.99640e-04 4.32295e-02
## 3 1067 SOC 9.98918e-05 9.98918e-05 9.98918e-05 9.98918e-05 9.98918e-05
## 4 1068 SOC 6.47624e-01 9.99640e-05 1.22187e-02 1.73512e-02 4.76202e-02
## 5 1069 SOC 7.26633e-01 9.99640e-05 9.99640e-05 1.42205e-02 2.05775e-02
## 6 1070 SOC 5.10010e-01 1.82754e-02 4.53471e-02 7.04568e-03 9.99640e-05
## v6 v7 v8 v9 v10 v11
## 1 9.99550e-05 8.49048e-03 0.2686440 1.15660e-01 9.99550e-05 1.45766e-02
## 2 7.56064e-03 1.42134e-02 0.3676750 6.39420e-03 9.99460e-05 9.99460e-05
## 3 9.98918e-05 9.98918e-05 0.9988010 9.98918e-05 9.98918e-05 9.98918e-05
## 4 9.99640e-05 7.65264e-02 0.1342570 4.63046e-02 9.99640e-05 4.81630e-03
## 5 9.99640e-05 4.10980e-04 0.0685492 9.72608e-02 3.02222e-02 9.99640e-05
## 6 6.77253e-03 9.99640e-05 0.2542530 1.32208e-01 2.11997e-02 4.48785e-03
## v12 v13
## 1 3.71808e-03 9.99550e-05
## 2 9.99460e-05 9.99460e-05
## 3 9.98918e-05 9.98918e-05
## 4 9.99640e-05 1.28824e-02
## 5 2.24654e-02 1.92606e-02
## 6 9.99640e-05 9.99640e-05
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak13 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette13 <-
c(
"yellow",
"#FF8C1A",
"#1E90FF",
"#77DD77",
"#B20CD9",
"#F49AC2",
"chocolate4",
"#B22222",
"blue",
"green",
"#008080",
"purple4",
"green4"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:13)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette13[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n LEA inference for k13 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette13) +
expand_limits(y = c(0, 1.5))
## [1] 5
# Extract ancestry coefficients
leak6 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K6/run5/r2_0.01_r5.6.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak6)
## # A tibble: 6 × 6
## X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.804 0.00343 0.0972 0.00590 0.0190 0.0703
## 2 0.925 0.000100 0.0592 0.00514 0.0103 0.000566
## 3 1.00 0.000100 0.000100 0.000100 0.000100 0.000100
## 4 0.757 0.0191 0.0966 0.0204 0.0569 0.0497
## 5 0.779 0.0398 0.159 0.0100 0.0107 0.00215
## 6 0.757 0.000100 0.0759 0.0347 0.000100 0.132
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1065 SOC 0.804195 0.003429300 0.09716880 0.00590210 0.018966800 0.070338300
## 2 1066 SOC 0.924653 0.000099991 0.05922170 0.00514278 0.010316600 0.000565636
## 3 1067 SOC 0.999500 0.000099960 0.00009996 0.00009996 0.000099960 0.000099960
## 4 1068 SOC 0.757393 0.019072000 0.09658790 0.02037480 0.056865700 0.049706600
## 5 1069 SOC 0.778649 0.039838100 0.15862000 0.01002310 0.010722500 0.002147300
## 6 1070 SOC 0.756830 0.000099982 0.07588540 0.03465280 0.000099982 0.132432000
Rename the columns
# Rename the columns starting from the third one
leak6 <- leak6 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak6)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1065 SOC 0.804195 0.003429300 0.09716880 0.00590210 0.018966800 0.070338300
## 2 1066 SOC 0.924653 0.000099991 0.05922170 0.00514278 0.010316600 0.000565636
## 3 1067 SOC 0.999500 0.000099960 0.00009996 0.00009996 0.000099960 0.000099960
## 4 1068 SOC 0.757393 0.019072000 0.09658790 0.02037480 0.056865700 0.049706600
## 5 1069 SOC 0.778649 0.039838100 0.15862000 0.01002310 0.010722500 0.002147300
## 6 1070 SOC 0.756830 0.000099982 0.07588540 0.03465280 0.000099982 0.132432000
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak6 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette6 <-
c(
"#77DD37",
"purple3",
"#1E90FF",
"#FFFF19",
"#FF8C1A",
"red")
# Generate all potential variable names
all_variables <- paste0("v", 1:6)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette6[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n LEA inference for k6 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette6) +
expand_limits(y = c(0, 1.5))
## [1] 5
Extract ancestry coefficients
# Extract ancestry coefficients
leak5 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K5/run5/r2_0.01_r5.5.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0188 0.125 0.807 0.0119 0.0370
## 2 0.00823 0.0629 0.925 0.00350 0.000100
## 3 0.000100 0.000100 1.00 0.000100 0.000100
## 4 0.0728 0.101 0.770 0.0211 0.0351
## 5 0.0216 0.150 0.783 0.0138 0.0314
## 6 0.000100 0.119 0.769 0.0356 0.0766
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 0.01879800 0.12493500 0.807368 0.01192510 0.036974400
## 2 1066 SOC 0.00823007 0.06291730 0.925257 0.00349554 0.000099991
## 3 1067 SOC 0.00009997 0.00009997 0.999600 0.00009997 0.000099970
## 4 1068 SOC 0.07279980 0.10095100 0.770038 0.02106580 0.035144600
## 5 1069 SOC 0.02159280 0.15040500 0.782771 0.01380250 0.031427900
## 6 1070 SOC 0.00009999 0.11857300 0.769123 0.03564650 0.076557900
Rename the columns
# Rename the columns starting from the third one
leak5 <- leak5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak5)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 0.01879800 0.12493500 0.807368 0.01192510 0.036974400
## 2 1066 SOC 0.00823007 0.06291730 0.925257 0.00349554 0.000099991
## 3 1067 SOC 0.00009997 0.00009997 0.999600 0.00009997 0.000099970
## 4 1068 SOC 0.07279980 0.10095100 0.770038 0.02106580 0.035144600
## 5 1069 SOC 0.02159280 0.15040500 0.782771 0.01380250 0.031427900
## 6 1070 SOC 0.00009999 0.11857300 0.769123 0.03564650 0.076557900
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette5 <-
c(
"purple3",
"#77DD37",
"#1E90FF",
"#FFFF19",
"#FF8C1A")
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette5[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n LEA inference for k5 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette5) +
expand_limits(y = c(0, 1.5))
## [1] 2
Extract ancestry coefficients
# Extract ancestry coefficients
leak4 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K4/run2/r2_0.01_r2.4.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak4)
## # A tibble: 6 × 4
## X1 X2 X3 X4
## <dbl> <dbl> <dbl> <dbl>
## 1 0.0703 0.0756 0.0428 0.811
## 2 0.0528 0.000100 0.00877 0.938
## 3 0.000100 0.000100 0.000100 1.00
## 4 0.109 0.0436 0.0738 0.774
## 5 0.108 0.0589 0.0395 0.793
## 6 0.107 0.114 0.000100 0.779
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4
## 1 1065 SOC 0.07025030 0.075632900 0.042754200 0.811363
## 2 1066 SOC 0.05278330 0.000099991 0.008770000 0.938347
## 3 1067 SOC 0.00009998 0.000099980 0.000099980 0.999700
## 4 1068 SOC 0.10880000 0.043615000 0.073761000 0.773824
## 5 1069 SOC 0.10816000 0.058946000 0.039478900 0.793415
## 6 1070 SOC 0.10668000 0.114057000 0.000099991 0.779163
Rename the columns
# Rename the columns starting from the third one
leak4 <- leak4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak4)
## ind pop v1 v2 v3 v4
## 1 1065 SOC 0.07025030 0.075632900 0.042754200 0.811363
## 2 1066 SOC 0.05278330 0.000099991 0.008770000 0.938347
## 3 1067 SOC 0.00009998 0.000099980 0.000099980 0.999700
## 4 1068 SOC 0.10880000 0.043615000 0.073761000 0.773824
## 5 1069 SOC 0.10816000 0.058946000 0.039478900 0.793415
## 6 1070 SOC 0.10668000 0.114057000 0.000099991 0.779163
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette4 <-
c(
"purple3",
"#FF8C1A",
"#77DD37",
"#1E90FF")
# Generate all potential variable names
all_variables <- paste0("v", 1:4)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette4[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=4.\n LEA inference for k4 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette4) +
expand_limits(y = c(0, 1.5))
## [1] 5
Extract ancestry coefficients
# Extract ancestry coefficients
leak3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.snmf/K3/run5/r2_0.01_r5.3.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak3)
## # A tibble: 6 × 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 0.0909 0.835 0.0743
## 2 0.0425 0.957 0.000100
## 3 0.000100 1.00 0.000100
## 4 0.151 0.799 0.0499
## 5 0.109 0.811 0.0798
## 6 0.0984 0.792 0.109
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3
## 1 1065 SOC 0.09087750 0.834777 0.074345300
## 2 1066 SOC 0.04245150 0.957449 0.000099991
## 3 1067 SOC 0.00009999 0.999800 0.000099990
## 4 1068 SOC 0.15131000 0.798747 0.049943000
## 5 1069 SOC 0.10881200 0.811341 0.079847100
## 6 1070 SOC 0.09835480 0.792389 0.109257000
Rename the columns
# Rename the columns starting from the third one
leak3 <- leak3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak3)
## ind pop v1 v2 v3
## 1 1065 SOC 0.09087750 0.834777 0.074345300
## 2 1066 SOC 0.04245150 0.957449 0.000099991
## 3 1067 SOC 0.00009999 0.999800 0.000099990
## 4 1068 SOC 0.15131000 0.798747 0.049943000
## 5 1069 SOC 0.10881200 0.811341 0.079847100
## 6 1070 SOC 0.09835480 0.792389 0.109257000
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette3 <-
c(
"#1E90FF",
"#FF8C1A",
"purple3"
)
# "#77DD37",
# Generate all potential variable names
all_variables <- paste0("v", 1:3)
# Create a data frame that pairs each potential variable with a color
olor_mapping <- data.frame(variable = all_variables,
color = color_palette3[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=3.\n LEA inference for k3 with 17,028 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette3) +
expand_limits(y = c(0, 1.5))
genotype <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.vcf"
)
d <- read.vcfR(
genotype
)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 47484
## column count: 418
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 47484
## Character matrix gt cols: 418
## skip: 0
## nrows: 47484
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant 21000
Processed variant 22000
Processed variant 23000
Processed variant 24000
Processed variant 25000
Processed variant 26000
Processed variant 27000
Processed variant 28000
Processed variant 29000
Processed variant 30000
Processed variant 31000
Processed variant 32000
Processed variant 33000
Processed variant 34000
Processed variant 35000
Processed variant 36000
Processed variant 37000
Processed variant 38000
Processed variant 39000
Processed variant 40000
Processed variant 41000
Processed variant 42000
Processed variant 43000
Processed variant 44000
Processed variant 45000
Processed variant 46000
Processed variant 47000
Processed variant: 47484
## All variants processed
Get population and individuals information
inds_full <- attr(d@gt,"dimnames")[[2]]
inds_full <- inds_full[-1]
a <- strsplit(inds_full, '_')
pops <- unname(sapply(a, FUN = function(x) return(as.character(x[1]))))
table(pops)
## pops
## ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC IMP ITB ITP ITR KER KRA
## 10 12 12 10 12 13 10 14 12 16 12 12 11 10 4 5 8 12 12 12
## MAL POL POP RAR ROM ROS SCH SER SEV SIC SLO SOC SPB SPC SPM SPS STS TIK TIR TRE
## 12 2 12 12 4 11 5 4 12 9 12 12 8 6 5 8 7 12 4 12
## TUA TUH
## 9 12
Convert format
##
## - number of detected individuals: 409
## - number of detected loci: 47484
##
## For SNP info, please check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.vcfsnp.
##
## 0 line(s) were removed because these are not SNPs.
## Please, check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.removed file, for more informations.
## [1] "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.geno"
##
## - number of detected individuals: 409
## - number of detected loci: 47484
##
## For SNP info, please check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.vcfsnp.
##
## 0 line(s) were removed because these are not SNPs.
## Please, check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.removed file, for more informations.
##
##
## - number of detected individuals: 409
## - number of detected loci: 47484
## [1] "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.lfmm"
PCA
## [1] "******************************"
## [1] " Principal Component Analysis "
## [1] "******************************"
## summary of the options:
##
## -n (number of individuals) 409
## -L (number of loci) 47484
## -K (number of principal components) 409
## -x (genotype file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.lfmm
## -a (eigenvalue file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.pca/r2_0.1.eigenvalues
## -e (eigenvector file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.pca/r2_0.1.eigenvectors
## -d (standard deviation file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.pca/r2_0.1.sdev
## -p (projection file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.pca/r2_0.1.projections
## -c data centered
## * pca class *
##
## project directory: /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/
## pca result directory: r2_0.1.pca/
## input file: r2_0.1.lfmm
## eigenvalue file: r2_0.1.eigenvalues
## eigenvector file: r2_0.1.eigenvectors
## standard deviation file: r2_0.1.sdev
## projection file: r2_0.1.projections
## pcaProject file: r2_0.1.pcaProject
## number of individuals: 409
## number of loci: 47484
## number of principal components: 409
## centered: TRUE
## scaled: FALSE
Test
## [1] "*******************"
## [1] " Tracy-Widom tests "
## [1] "*******************"
## summary of the options:
##
## -n (number of eigenvalues) 409
## -i (input file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.pca/r2_0.1.eigenvalues
## -o (output file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.pca/r2_0.1.tracywidom
# tw$pvalues
# plot the percentage of variance explained by each component
plot(tw$percentage, pch = 19, col = "blue", cex = .8)
Get values
# plot preparation
pc.coord <- as.data.frame(pc$projections)
colnames(pc.coord) <- paste0("PC", 1:nPC)
pc.coord$Individual <- inds
pc.coord$Population <- pops
# perc1 <- paste0(round(tw$percentage, digits = 3) * 100, "%")
perc <- paste0(round(pc$eigenvalues/sum(pc$eigenvalues), digits = 3) * 100, "%")
nb.cols <- 40
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
Check R symbols for plot
#to see all shapes -> plot shapes - para escolher os simbolos
N = 100; M = 1000
good.shapes = c(1:25,33:127)
foo = data.frame( x = rnorm(M), y = rnorm(M), s = factor( sample(1:N, M, replace = TRUE) ) )
ggplot(aes(x,y,shape=s ), data=foo ) +
scale_shape_manual(values=good.shapes[1:N]) +
geom_point()
Sample data
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
More sample data
# import sample attributes
samples2 <- read.delim(
here(
"output", "Population_meta_data.txt"
),
head = TRUE
)
samples2<- samples2 |>
dplyr::select(
region, pop
)
# check head of the file
head(samples2)
## region pop
## 1 Central Africa GAB
## 2 East Africa ANT
## 3 East Africa MAD
## 4 East Africa DGV
## 5 East Africa VOH
## 6 Indian Ocean DAU
Merge with sampling_loc
## pop region Pop_City Country Latitude Longitude
## 1 AIZ East Asia Aizuwakamatsu City Japan 37.49240 139.99360
## 2 ALD Southern Europe Durres Albania 41.29704 19.50373
## 3 ALU Eastern Europe Alushta Ukraine 44.68289 34.40368
## 4 ALV Southern Europe Vlore Albania 40.46600 19.48970
## 5 ANT East Africa Antananarivo Madagascar -18.87920 47.50790
## 6 ARM Eastern Europe Ijevan Armenia 40.87971 45.14764
## Continent Year Region Subregion order
## 1 Asia 2008 East Asia NA
## 2 Europe 2018 Southern Europe East Europe 25
## 3 Europe 2021 Eastern Europe East Europe 35
## 4 Europe 2020 Southern Europe East Europe 24
## 5 Africa 2022 East Africa East Africa 76
## 6 Europe 2020 Eastern Europe East Europe 42
Check pops
## [1] SOC SOC SOC SOC SOC SOC
## 42 Levels: ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC IMP ... TUH
## [1] 42
Merge
## Population PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 1 ALD -11.6579 -27.9690 -44.1409 13.07330 -10.5103 2.333360 2.47828
## 2 ALD -11.8254 -31.6101 -51.2300 11.05860 -11.8620 6.509100 -1.88273
## 3 ALD -11.8276 -36.5848 -38.9052 16.13450 -14.1549 10.236100 4.96719
## 4 ALD -10.8478 -33.8094 -46.7079 5.42425 -12.3550 -0.941851 -2.08383
## 5 ALD -12.5341 -26.3646 -43.1096 2.29191 -10.1135 -2.314830 3.81301
## 6 ALD -11.0573 -33.7093 -48.9287 9.59603 -10.4907 8.264140 2.05864
## PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15
## 1 -0.920782 -10.43410 -3.83056 2.97885 9.31085 -14.70810 -3.00870 1.162340
## 2 2.875530 -14.21090 -9.11104 1.77405 16.63980 -15.68680 -5.43410 -2.547860
## 3 -0.502599 -8.25921 -6.28729 2.02247 17.89580 -10.15810 -1.12968 1.460850
## 4 4.994270 -6.65209 -3.28503 -1.25035 2.09205 -2.22973 4.01517 2.377930
## 5 4.184680 -4.85270 1.86280 4.84091 4.73419 -13.52730 -2.37063 0.926806
## 6 -0.850417 -6.76725 -7.93507 2.89587 6.69845 -10.96920 -4.59706 -1.657440
## PC16 PC17 PC18 PC19 PC20 PC21 PC22
## 1 15.11750 0.717381 -12.72930 0.756383 -0.136708 -5.49075 -5.67005
## 2 13.68810 -1.777950 -7.08326 0.496845 3.433940 -3.71409 -4.05632
## 3 14.34960 -2.999160 -8.92470 2.103850 5.999540 -3.38573 -11.14230
## 4 4.82982 2.959250 -0.50269 -0.690329 9.374860 -4.96182 8.85820
## 5 16.07220 -9.001770 12.64850 -15.493100 -12.479500 -15.52130 -14.25180
## 6 11.80420 2.844720 -10.66160 -7.736230 3.562870 -2.09384 -2.48463
## PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
## 1 -5.984290 4.39982 11.78970 -6.835890 -12.66780 -5.31345 6.15332 10.04970
## 2 -9.079700 6.59653 9.66382 -5.435130 -2.22784 -2.29229 4.97826 4.15231
## 3 -14.012500 10.31950 9.33576 -2.193210 1.69822 4.75277 8.26638 4.25581
## 4 -5.097990 -7.18522 3.24951 1.399710 15.97430 4.85988 3.01595 6.83983
## 5 -2.898620 10.72600 8.31180 -5.261930 -6.83206 -3.93861 -4.35314 3.91953
## 6 -0.416598 1.72803 6.83709 0.894634 -26.01630 -3.49035 -10.99340 17.71460
## PC31 PC32 PC33 PC34 PC35 PC36 PC37 PC38
## 1 -3.47651 20.26750 -19.988100 5.24872 23.10030 0.514816 -5.02374 1.22129
## 2 -3.29091 14.99080 -11.121000 -1.99799 12.27580 3.768280 -6.26303 2.58603
## 3 2.09609 4.56566 -1.504930 3.28042 14.29880 2.213280 -16.42230 9.04348
## 4 1.78815 -2.64817 0.343047 -8.96805 8.17103 -2.879770 3.22537 -1.14725
## 5 -4.77921 2.97294 -10.488000 4.21135 7.74877 2.522950 3.36325 1.66936
## 6 -11.10030 -2.62617 -3.539690 -3.34805 -4.94730 4.581160 13.71960 -6.26580
## PC39 PC40 PC41 PC42 PC43 PC44 PC45
## 1 5.1053000 6.006060 -12.207900 12.13160 10.79360 4.86022 -10.28550
## 2 -0.5851700 2.925370 -1.120860 15.82060 15.53930 3.67843 -3.91608
## 3 1.9799600 3.280710 -0.179321 15.78210 11.89110 2.61741 -7.59388
## 4 1.7267700 -0.809569 -10.478600 1.55454 -10.46390 -7.53708 4.39635
## 5 0.0158973 4.682680 -5.116480 6.58365 -2.50899 10.35340 -10.57600
## 6 2.3405300 1.448830 22.478500 3.29390 6.50008 4.78129 -12.30290
## PC46 PC47 PC48 PC49 PC50 PC51 PC52
## 1 -0.4516270 -0.646017 7.01854 10.70270 -2.660740 3.64003 3.91273
## 2 3.7791000 3.127380 10.82160 8.64520 3.076190 5.80129 6.51709
## 3 -0.0192116 5.792690 10.95710 7.39521 -9.464410 10.45780 6.03173
## 4 7.3820500 -9.823890 7.54878 13.26680 -3.435130 3.47748 4.14988
## 5 -7.1985900 2.624570 2.84713 11.46820 5.343910 -6.88548 -1.29110
## 6 -0.0582287 4.698800 -3.19463 -49.07410 -0.175759 -44.09510 -4.38864
## PC53 PC54 PC55 PC56 PC57 PC58 PC59
## 1 -7.16723 18.44990 3.970240 -1.013560 -3.321320 -5.512170 -2.46536
## 2 -3.19758 20.17530 2.631920 3.908730 3.820920 -12.901800 1.01095
## 3 7.13620 20.56140 0.533156 -0.959863 -0.846978 0.792778 6.70576
## 4 -11.73720 7.78464 3.453320 12.232500 -4.937380 -4.240720 -2.34783
## 5 -7.36066 8.78496 -10.027400 2.061130 -7.867880 -2.334410 3.58740
## 6 17.42780 -24.35680 0.276107 -15.192900 1.074910 -18.400000 7.59839
## PC60 PC61 PC62 PC63 PC64 PC65 PC66
## 1 -5.50172 -0.198877 -4.29422 -5.263770 -6.983110 -1.0319800 -3.88479
## 2 -8.61719 -5.557800 -11.38350 -5.834600 -4.856350 5.4841400 -9.18582
## 3 -4.75257 -6.409890 -10.21460 -5.383330 -0.422476 0.0672973 -10.96400
## 4 -10.16650 6.914880 -9.51785 -9.410610 -2.189310 10.0438000 2.55125
## 5 1.33817 -1.317260 -6.76897 -16.210000 6.370330 -3.7056900 9.57391
## 6 13.13430 -1.313810 16.20000 -0.662348 -27.528500 5.6094400 1.23793
## PC67 PC68 PC69 PC70 PC71 PC72 PC73 PC74
## 1 0.45474 -2.58798 -1.88256 0.392983 -4.9525300 2.63946 1.96697 -3.80154
## 2 -6.89015 -2.54897 8.24276 4.964190 0.0250136 3.73288 -10.39420 5.21952
## 3 -17.05930 13.50500 9.03814 17.221700 5.2044700 16.56120 -7.40581 6.28536
## 4 -2.11203 9.36051 8.51656 -5.252570 15.4165000 -4.91737 -2.91107 -2.00968
## 5 5.34755 -9.70900 11.03560 -5.243700 -5.0130100 4.85135 -11.71030 10.43630
## 6 -18.34170 6.84433 1.11399 -2.198430 7.1618700 -2.24213 4.70259 8.08848
## PC75 PC76 PC77 PC78 PC79 PC80 PC81
## 1 0.2808140 -0.717134 -4.952970 5.14396 -1.2568600 17.2206000 0.245875
## 2 4.5493500 -7.646610 -6.297820 9.39346 4.4808100 0.0658922 -0.454177
## 3 1.4116100 -5.022650 -7.394960 4.82979 0.0400216 7.7017600 6.016480
## 4 0.0435947 9.497980 -3.294140 2.86915 14.2435000 3.9936900 -8.997840
## 5 1.9375700 -7.766310 4.058450 7.30916 1.3256900 -3.0882300 -1.572980
## 6 4.3458000 -6.596620 0.290399 -1.89272 4.1835900 16.2519000 -8.745020
## PC82 PC83 PC84 PC85 PC86 PC87 PC88
## 1 -6.49355 -13.769700 -4.196930 -0.386225 -2.446140 14.598300 -2.22194
## 2 2.63605 -9.575460 1.034970 -7.237970 0.569297 -1.094120 7.10518
## 3 3.72600 -17.330800 -1.808960 -5.329970 7.487040 -3.028250 2.84273
## 4 8.61246 -0.943595 0.153017 13.058500 -16.876500 0.204623 -6.98638
## 5 3.99397 -3.697800 6.807590 6.911750 -11.142900 20.780000 -13.89920
## 6 -8.31439 3.764910 -6.007870 -3.693650 1.485260 6.552310 7.78248
## PC89 PC90 PC91 PC92 PC93 PC94 PC95
## 1 -4.027270 7.662140 -20.253600 7.510850 4.68262 -6.54716 1.850000
## 2 5.169720 4.265550 0.180027 -0.965736 0.24376 -5.29317 -12.295900
## 3 -0.857548 0.131778 2.568540 5.449060 -7.84955 -9.03535 -8.120480
## 4 2.634830 -14.270500 7.470540 -0.710916 13.54620 -3.30130 -5.074640
## 5 3.327550 -4.454530 8.348480 -2.481660 1.01278 1.20881 4.482180
## 6 14.913600 -2.936270 -16.309600 -5.341540 9.84932 13.15910 -0.345686
## PC96 PC97 PC98 PC99 PC100 PC101 PC102
## 1 2.40111 -10.014500 1.47668 -1.17489 2.28575 6.76624 -1.21081
## 2 7.33339 4.132870 -12.33440 10.94460 4.02164 6.86795 -4.46914
## 3 11.05800 -0.835219 -3.89829 20.31090 13.54320 5.93717 4.09967
## 4 -12.07020 11.963400 10.91550 6.12285 -15.23450 -12.65260 -3.35621
## 5 1.42141 -15.242600 -12.71620 -11.69190 7.53554 -8.87482 12.71180
## 6 -5.21543 2.701670 -1.89634 2.40607 3.25766 10.03800 2.44705
## PC103 PC104 PC105 PC106 PC107 PC108 PC109
## 1 3.16310 3.963550 -1.250340 -1.467530 -9.3119000 6.625710 12.28200
## 2 0.37859 -3.362290 2.493390 1.536030 0.0571149 -0.598152 6.84769
## 3 5.67197 1.801920 0.474969 -2.703860 -6.3660600 -4.942490 9.01270
## 4 7.69515 -16.931400 -2.965230 -0.586932 -11.0177000 -14.453400 3.11070
## 5 -16.07020 8.996730 0.219746 9.172000 -0.9582000 9.040390 1.21495
## 6 4.65728 -0.287229 11.431100 -0.561919 -4.7217800 0.899694 -13.56220
## PC110 PC111 PC112 PC113 PC114 PC115 PC116
## 1 5.059040 -9.93621 -13.660700 -8.77445 -0.682461 -3.660330 -12.689000
## 2 0.249305 -3.21307 -11.148300 -2.32990 -1.781480 -1.967440 -8.048340
## 3 -14.658000 -8.41044 0.727095 2.56796 -3.483850 -2.843480 -0.135852
## 4 4.662380 12.05560 -3.935680 -5.34607 -8.139620 -2.777610 -4.508910
## 5 -10.397600 -0.69726 10.955000 6.74477 5.405270 0.857081 4.616720
## 6 3.244200 -4.35498 6.422140 -11.33790 0.651984 -1.339550 4.916310
## PC117 PC118 PC119 PC120 PC121 PC122 PC123 PC124
## 1 -1.22395 6.171900 -3.79474 4.01713 -3.213150 -9.986690 17.71150 -7.925910
## 2 -8.41811 1.009510 -6.47856 -3.01031 -0.738815 -0.575996 -1.71960 -12.472200
## 3 -4.78576 -1.691020 -4.60758 -1.14629 11.322800 8.036410 -13.03850 -2.997990
## 4 8.70960 5.331950 -2.30716 -0.90583 -4.932240 -0.151724 -6.09036 -0.318938
## 5 -5.14267 -7.137840 5.73054 -4.03671 5.973340 2.799570 5.12033 -5.253800
## 6 3.91759 -0.428513 -1.24078 7.22804 1.650730 -2.640020 8.18158 -0.896967
## PC125 PC126 PC127 PC128 PC129 PC130 PC131 PC132
## 1 7.87029 0.0574634 10.83080 13.51310 7.06173 1.409660 -3.29115 -2.92033
## 2 -13.72460 1.4450800 6.64383 4.06393 3.34087 2.879020 -8.73574 6.97982
## 3 -5.44186 1.9793000 -9.25389 -8.53558 -9.60394 -5.810790 4.68657 1.80424
## 4 20.81070 7.3628600 15.14950 15.22270 8.53325 -0.540119 4.54803 7.80776
## 5 19.13740 -8.5484300 0.51629 14.94360 17.67410 -18.774200 -6.77861 12.32080
## 6 2.65178 2.4920800 -3.23295 3.33740 4.73122 5.735830 -1.19621 1.15408
## PC133 PC134 PC135 PC136 PC137 PC138 PC139
## 1 4.217050 1.9366100 3.662840 3.01005 -8.65587 3.439540 -2.43056
## 2 -0.472462 0.0180527 -10.773700 -12.34980 -1.75497 20.581800 6.52978
## 3 -8.644410 -11.0338000 2.800320 -1.60253 4.87242 5.104900 11.83690
## 4 2.301550 -8.2496700 0.405541 8.06931 -7.29565 7.827040 -7.92791
## 5 -19.244200 14.1552000 -19.167100 1.49410 -4.31047 -1.309280 -8.55088
## 6 -8.625380 7.4595400 1.352730 5.04378 -2.72548 0.956049 -4.03382
## PC140 PC141 PC142 PC143 PC144 PC145 PC146
## 1 3.23729 2.886750 -3.00369 1.43686 8.1096100 -5.56608 -5.122100
## 2 -12.38840 -0.019887 2.22128 -1.01308 -0.0147786 3.82760 4.821020
## 3 -7.44323 5.888740 6.07072 -3.21879 8.9764700 1.30310 0.164319
## 4 -5.53354 -4.952570 7.46043 7.92769 -12.3246000 -11.51410 8.078670
## 5 -19.07220 -5.849110 3.65776 -13.53510 -2.1913900 -5.20952 9.537730
## 6 -1.28566 -10.377700 5.46230 2.60225 -1.4684500 2.60784 8.845980
## PC147 PC148 PC149 PC150 PC151 PC152 PC153
## 1 -4.190100 -9.58200 3.88227 6.840740 -9.385150 -11.1110000 6.38799
## 2 9.844550 -10.80400 13.76170 -2.409540 0.517485 1.8990300 -9.86676
## 3 -4.687510 11.39380 -11.25480 -2.929910 10.071100 0.0506166 -13.99900
## 4 14.145900 -5.32769 15.92110 -0.147928 12.327900 3.3483700 -6.62975
## 5 6.918080 1.54454 13.86410 10.133700 -8.959470 -2.6304100 4.57863
## 6 -0.856964 -9.80717 -7.02375 0.954003 -1.108610 5.5684900 2.00052
## PC154 PC155 PC156 PC157 PC158 PC159 PC160
## 1 3.395280 8.963890 2.87107 3.003930 -11.40040 3.026950 9.94089
## 2 6.163130 -11.232500 8.92145 5.434450 2.26650 -1.578190 7.40412
## 3 -6.095510 7.085670 -5.85035 -8.315050 8.69606 -1.992120 -8.15777
## 4 4.038690 1.168760 -3.31995 10.542300 -12.65030 -4.410590 11.69480
## 5 -0.101103 -0.971996 3.11635 -3.102040 16.76330 -0.288692 -9.63154
## 6 -7.204990 -3.468040 12.32490 0.100048 1.78026 -3.471010 6.80656
## PC161 PC162 PC163 PC164 PC165 PC166 PC167
## 1 4.360360 -11.24420 -13.13560 -0.561281 -5.58034 -13.991700 -5.3342300
## 2 -0.300207 -9.53374 -17.90680 -6.183740 -6.54931 -4.200990 0.0470631
## 3 -3.431190 -14.26410 8.48463 1.424760 1.50288 11.908400 -2.2567100
## 4 3.957550 1.91547 16.46500 -1.634610 -7.01608 -0.570898 18.4511000
## 5 14.914300 16.98660 6.23391 6.905520 -13.42640 -5.472710 2.0394300
## 6 -8.493320 -1.81089 6.49120 2.699850 7.83358 -5.504900 -3.3484500
## PC168 PC169 PC170 PC171 PC172 PC173 PC174
## 1 -5.246180 2.996990 0.726642 -0.151265 12.02920 6.79858 5.80479
## 2 -0.578584 -6.939650 6.080340 3.880210 3.04088 4.15737 6.37352
## 3 1.264660 12.709700 -1.471710 2.256270 -8.04033 -4.66502 -5.78199
## 4 -11.213000 -14.964600 10.467000 -8.080910 11.10890 4.06190 16.66030
## 5 -7.016290 0.793634 -0.248035 -1.019920 3.65775 -13.84540 -5.93016
## 6 1.193880 1.363600 -4.157180 2.155980 -8.94114 -1.13864 4.27069
## PC175 PC176 PC177 PC178 PC179 PC180 PC181
## 1 -5.356850 -19.706500 7.770840 -21.471400 14.032100 10.265800 3.82050
## 2 -0.575964 16.928100 6.927820 6.010530 1.009190 3.411790 5.56998
## 3 2.187180 -3.826220 -0.931741 17.311900 1.830330 0.817655 6.06723
## 4 -0.829007 1.868220 -0.647000 -8.088390 -0.678108 2.475980 -11.08900
## 5 10.849200 -15.751400 4.454440 -7.627810 -9.399860 8.172540 -6.54801
## 6 4.569130 0.423155 1.631600 -0.266451 0.545985 -3.126050 -5.47633
## PC182 PC183 PC184 PC185 PC186 PC187 PC188
## 1 6.20591 8.8070500 15.964400 3.84788 -2.42656 -0.0809171 8.57243
## 2 2.62604 -10.4667000 1.129650 1.97484 -14.01400 -19.6776000 8.50954
## 3 2.16591 2.5485200 -2.240060 -2.35578 -2.75805 12.6139000 -9.11576
## 4 -12.34140 2.0557000 -4.019630 -3.49606 -6.48537 -9.5355700 6.26296
## 5 1.83441 8.3190700 -3.114360 -8.76134 1.17146 -5.8661800 -1.15634
## 6 -1.60512 0.0927896 -0.100493 -4.88427 2.27058 -0.5628220 -4.58904
## PC189 PC190 PC191 PC192 PC193 PC194 PC195
## 1 -5.77007 -9.58328 17.695000 -1.150770 -5.0184500 -0.500741 7.01297
## 2 -8.50286 7.94532 1.166650 -6.476430 -0.8063870 3.485460 14.80900
## 3 -3.27819 2.91604 11.569300 0.226403 -3.2103300 -6.224160 -5.92130
## 4 1.66453 -11.52020 5.349960 -5.861460 -0.0403829 -3.784520 9.21213
## 5 4.11791 -9.67038 -2.359740 1.898980 22.6605000 13.415300 -2.63150
## 6 3.26554 -1.54080 -0.346248 -0.800325 -0.6995660 1.941880 -2.06252
## PC196 PC197 PC198 PC199 PC200 PC201 PC202
## 1 -3.945050 -8.477160 -0.681499 0.666917 11.30000 4.52734 7.299130
## 2 -17.772200 -7.077820 9.502410 -9.185660 2.10595 6.87414 6.515630
## 3 11.176500 6.524160 1.010000 0.981463 1.77581 1.05263 10.967900
## 4 1.044630 6.197120 -1.421620 1.490290 -3.62312 4.26465 -1.057470
## 5 10.898200 -0.559928 -3.750520 13.205200 12.00690 -12.67170 -12.276900
## 6 -0.391076 -2.729550 1.696110 0.141113 -3.72166 7.46838 0.147472
## PC203 PC204 PC205 PC206 PC207 PC208 PC209
## 1 -2.082350 -23.05500 9.02803 1.72259 7.471070 -5.101650 -10.882100
## 2 8.700440 7.82652 1.85502 7.02641 -5.504200 -1.666840 4.316330
## 3 -10.001700 3.29036 3.86907 -5.20413 5.880860 -12.954300 -1.853360
## 4 -15.697000 5.73974 -5.00117 2.54565 0.902268 0.987977 -3.834410
## 5 -0.746243 -1.36843 -4.81689 -5.69796 7.654070 6.378350 2.189510
## 6 -2.950340 4.81482 -2.19698 3.06647 4.483840 3.692530 -0.216282
## PC210 PC211 PC212 PC213 PC214 PC215 PC216
## 1 7.19151 12.07900 12.923000 1.333130 8.84348 5.14199 -16.530700
## 2 -3.50355 5.73162 9.508960 -9.218450 -2.75026 9.35677 0.607912
## 3 -5.61148 5.46940 10.880500 -0.593966 -5.70412 -13.87200 -9.437590
## 4 7.04686 2.76690 -11.601000 1.500470 -8.28542 -6.77595 2.647080
## 5 3.52930 -4.85541 4.467720 -0.231971 1.62174 -14.12690 7.546860
## 6 -7.42755 0.40343 0.468292 -2.607520 -5.37830 -4.36105 1.371080
## PC217 PC218 PC219 PC220 PC221 PC222 PC223
## 1 -12.9350000 6.222890 2.24184 -12.68170 6.82784 4.749940 -15.079400
## 2 11.9839000 11.299600 4.63353 11.28480 -5.89543 -11.962600 5.733460
## 3 5.8333900 4.212170 -1.46277 1.74288 5.36003 6.019640 4.750220
## 4 -6.8777900 -6.985080 4.83852 6.93020 -11.54490 9.601000 -0.033089
## 5 -0.0460042 -12.110500 -3.92304 -3.27021 -2.14745 5.023910 1.254730
## 6 -0.9450490 0.945727 4.69435 6.51500 3.70622 0.718371 0.531868
## PC224 PC225 PC226 PC227 PC228 PC229 PC230
## 1 15.74960 -12.4049000 8.421440 -4.80110 -0.180886 0.860578 9.015210
## 2 8.22372 2.3600900 19.896500 13.28420 1.740790 -10.649300 10.267200
## 3 4.72768 -2.2774200 -0.216011 -12.00640 5.466540 2.223350 -2.862910
## 4 3.91849 -7.8746700 0.317080 12.04310 5.953100 -5.180840 0.245783
## 5 -4.00310 -0.0934664 -7.940840 4.57040 -18.087500 -6.294870 -11.688500
## 6 -2.34708 3.1825100 -4.036830 3.74678 2.973200 1.840450 -1.688980
## PC231 PC232 PC233 PC234 PC235 PC236 PC237
## 1 -2.530200 5.9910400 -2.83827 -1.65995 -0.895605 13.5927000 -2.37297
## 2 2.334700 -2.8400500 3.09647 -2.26472 11.628500 8.2606400 -2.79769
## 3 0.164236 1.2683700 2.28603 3.26253 6.111320 -1.9872000 -5.23865
## 4 -3.369670 14.7509000 -12.87470 1.08809 -7.080030 -0.0686292 2.99608
## 5 -2.578440 1.4335600 7.52175 -3.15088 -11.992600 7.2423200 4.86159
## 6 0.317631 -0.0654041 -1.77557 1.26512 -1.193590 -0.0298539 -1.30434
## PC238 PC239 PC240 PC241 PC242 PC243 PC244
## 1 3.453260 2.6741500 -2.525270 7.23135 -6.138770 0.8445800 -2.453210
## 2 1.046980 -0.0480712 -10.755500 -1.34918 -7.176770 -8.3114800 -8.152530
## 3 -4.183610 12.0586000 0.346962 14.78980 -2.573900 -4.3154200 6.604240
## 4 -4.438080 -6.4687500 4.680510 6.75897 13.652800 6.4963700 -8.463970
## 5 -2.902360 8.7220800 -15.238100 -9.41990 -0.561574 -2.3839800 0.932261
## 6 -0.125103 1.9735000 0.431890 5.82463 -3.284410 0.0288886 -3.128540
## PC245 PC246 PC247 PC248 PC249 PC250 PC251 PC252
## 1 -5.89010 -10.572500 -1.59359 0.18791 -7.156260 -7.21891 -2.87522 1.44843
## 2 4.43083 3.506130 -3.56553 -1.87446 -8.100160 -12.07620 -6.32810 3.02454
## 3 3.22009 -0.283365 4.99624 -6.42187 0.343386 7.21484 -7.52871 1.04615
## 4 -1.00465 -2.488310 1.08680 -8.98522 -2.684410 5.44405 -1.40658 4.72448
## 5 -9.76459 4.629220 11.13250 7.18500 5.472320 2.97945 2.37315 2.10148
## 6 -0.61058 3.140950 -1.00209 -3.00625 -2.262030 -2.08537 -7.53896 3.59425
## PC253 PC254 PC255 PC256 PC257 PC258 PC259
## 1 5.0180100 7.49993 5.12992 1.542620 -13.228100 -0.303155 5.13629
## 2 1.6973200 -2.85602 -8.13174 1.137610 16.095800 8.214740 -15.59050
## 3 0.0105241 3.81127 -8.39836 2.097310 -5.634590 2.396130 -5.98992
## 4 -1.0874100 3.12900 -0.63428 -9.286460 -8.097390 0.233415 15.79210
## 5 4.1922900 9.24029 4.82078 0.633527 6.604730 -0.102930 1.61733
## 6 4.6487100 2.34682 -2.46844 -2.312360 0.835071 -0.878936 -2.71597
## PC260 PC261 PC262 PC263 PC264 PC265 PC266 PC267
## 1 -3.77831 -19.89700 -11.26510 -11.23140 3.020410 10.82450 4.574950 -3.57970
## 2 6.96233 8.48900 6.81271 2.72244 2.687810 -8.44939 -3.823320 -4.03176
## 3 7.77379 3.77265 -11.32690 -6.27256 6.324970 2.32695 5.871010 -5.41702
## 4 6.43082 16.81350 -3.19179 -4.66300 -4.548910 1.44087 -3.401940 -10.97420
## 5 8.42916 11.17490 4.00527 4.73950 4.158680 1.68798 0.846988 7.37680
## 6 4.19389 2.82012 2.00248 4.76617 0.334303 -3.71840 2.066500 -0.49027
## PC268 PC269 PC270 PC271 PC272 PC273 PC274
## 1 -13.963200 -4.04754 3.511070 -20.653300 10.015400 3.14592 -3.25935
## 2 17.267900 -1.40342 5.230810 4.868920 -2.667110 -4.44074 1.88939
## 3 0.410315 -1.76202 -6.203720 2.901350 -3.362110 -3.13908 -2.81295
## 4 1.547700 9.58130 -1.346580 13.562400 -7.276250 -2.58789 -10.30860
## 5 3.461940 4.46580 9.936390 0.862957 -1.980310 7.96228 4.00161
## 6 1.414960 -0.92966 -0.512087 0.221803 -0.412746 2.21589 3.11833
## PC275 PC276 PC277 PC278 PC279 PC280 PC281
## 1 -19.25650000 -5.330360 -0.771584 6.686020 -9.059650 9.242280 5.202850
## 2 16.58740000 10.307200 3.447760 2.118050 13.287800 -3.021730 -0.926631
## 3 4.92041000 -8.299630 6.766060 -3.220350 4.763250 -0.424483 -5.774610
## 4 0.12012500 -15.113200 -13.472800 -8.277550 0.603387 -0.876582 -3.956690
## 5 -10.85720000 16.060000 3.336100 -2.266130 4.931960 3.500340 2.879100
## 6 -0.00167577 0.801365 0.390391 -0.268441 -3.700800 -2.113290 0.102818
## PC282 PC283 PC284 PC285 PC286 PC287 PC288
## 1 1.5327600 3.66003 -4.091620 -5.348930 8.051240 4.60634 4.23058
## 2 -0.0290444 6.71089 8.943650 8.004830 -5.819240 -2.48291 -1.53252
## 3 -7.7113200 2.57367 3.045750 7.407640 -3.780520 4.83005 -4.80180
## 4 -4.6430400 -13.36380 -14.677700 -4.611060 -0.289894 -3.68193 -4.00447
## 5 -4.4865000 5.57821 9.538420 -0.231728 5.334270 -11.28940 4.76778
## 6 -0.7941230 3.16233 0.806442 0.144405 0.432343 -0.68628 -0.89060
## PC289 PC290 PC291 PC292 PC293 PC294 PC295 PC296
## 1 -3.24404 -3.45227 -6.17873 -1.98265 1.409880 -1.612190 -2.45372 -5.961990
## 2 1.02318 11.73820 -2.25975 -1.08645 -3.680280 1.423690 9.31186 4.612300
## 3 6.18109 -6.95490 11.92160 4.43948 -14.863600 -7.639240 -5.16735 0.535336
## 4 13.48680 -5.39914 9.68505 1.98522 -0.373096 1.657880 2.64050 -6.906240
## 5 6.01770 3.09109 -2.71345 -2.80469 3.570400 9.836240 2.73471 13.731200
## 6 -2.26049 -5.77634 0.30522 2.76081 0.664204 -0.427645 1.11989 -3.116280
## PC297 PC298 PC299 PC300 PC301 PC302 PC303
## 1 -5.883550 -3.12639 5.08253 -7.1961900 0.0546971 -1.914440 -1.395160
## 2 19.411400 -17.02420 -9.86059 13.5820000 4.0878100 -1.436280 8.974190
## 3 -14.697700 11.90760 -2.49708 -6.2355000 -1.5890600 4.503080 -0.722312
## 4 6.917020 15.22240 -2.83759 8.4052200 -1.6946600 -2.584370 1.012650
## 5 -0.873656 -4.00498 -1.45914 0.0712398 5.2929100 -2.796200 1.268480
## 6 -3.610420 -1.14764 -1.18998 -1.3689300 -1.5415800 -0.220665 -1.663930
## PC304 PC305 PC306 PC307 PC308 PC309 PC310
## 1 2.908920 -5.55177 7.498670 -9.15920 1.006620 -0.0175312 0.0940361
## 2 3.145740 -2.77204 -8.959330 1.91510 -3.177480 -3.7121900 2.7188700
## 3 -9.784070 14.48100 1.105010 13.03460 -5.623080 1.7157300 -16.7006000
## 4 2.135280 -7.32511 3.848170 9.41896 -1.079820 -0.2941830 2.6426200
## 5 -4.550440 10.39170 -2.198810 3.50245 0.873491 -0.7903520 -5.5335500
## 6 0.345752 -3.04700 -0.501176 2.81152 -0.465367 1.0523400 1.8759900
## PC311 PC312 PC313 PC314 PC315 PC316 PC317
## 1 3.2415600 -0.533679 -1.25471e+00 -2.344580 3.682100 3.202760 3.005340
## 2 -0.6110050 5.695760 -1.98231e+00 -0.192491 -2.713440 -8.901960 -1.877500
## 3 -19.3327000 0.111527 2.55907e+00 8.599350 0.278793 6.849740 0.300789
## 4 -3.0960400 -4.208450 -2.06772e+00 -1.743580 -2.021830 6.088350 -4.306430
## 5 0.0949942 0.840990 2.95309e+00 1.777290 -2.271260 -2.636700 3.748370
## 6 -0.2767070 1.228160 -2.75052e-06 2.600050 -2.435510 0.950568 -3.463260
## PC318 PC319 PC320 PC321 PC322 PC323 PC324
## 1 7.731760 -0.241106 -2.83840 1.98291 -2.21665 -1.3555800 1.382100
## 2 -4.441470 -1.810390 4.18181 -6.06096 3.26692 0.0796167 4.892500
## 3 -6.208050 10.515500 -7.46418 10.96970 -4.95853 -2.3344500 -12.898900
## 4 -1.354520 0.393562 -5.12320 -5.12295 6.36029 3.3190400 -2.668100
## 5 -0.190006 2.378530 1.52795 4.88849 -3.95607 -2.7324700 -1.281490
## 6 0.602262 -0.256680 -1.26750 -2.85256 -0.42916 -2.1571300 -0.639799
## PC325 PC326 PC327 PC328 PC329 PC330 PC331
## 1 -1.33398 -1.21593 1.087590 0.647461 0.169701 3.459440 -0.237852
## 2 10.03150 -4.06625 2.225630 -3.258690 0.492319 2.011580 3.139470
## 3 -12.22180 6.70498 -4.405400 1.769940 -0.507482 3.810530 -5.409120
## 4 -2.66243 -0.70981 -0.407433 2.177580 2.096280 -1.882490 -5.155170
## 5 -4.53147 5.40499 -6.598650 -3.071510 -1.240230 2.389070 0.858653
## 6 -1.08633 2.21093 -0.784808 -1.885140 2.541550 -0.213186 1.503490
## PC332 PC333 PC334 PC335 PC336 PC337 PC338
## 1 1.954150 2.1152100 1.120480 3.476900 4.296780 -0.7847490 2.703400
## 2 2.362990 6.6671500 -4.168180 -0.165620 0.862123 -0.0846367 1.536480
## 3 -11.232500 -11.8589000 2.353630 -2.437900 -2.452390 1.1418900 -11.995100
## 4 0.198402 -8.2744000 -1.076350 -4.302380 -1.246480 -1.2986700 -8.938090
## 5 -1.933620 -3.8209200 -1.689330 -0.449452 2.084110 -0.3653080 -1.087840
## 6 -3.620920 0.0678934 -0.266918 0.562005 -1.366430 -1.0840000 -0.707899
## PC339 PC340 PC341 PC342 PC343 PC344 PC345
## 1 0.0648793 -2.8283800 -0.0171229 1.369510 0.809884 0.139422 -1.4413200
## 2 -2.1246200 -0.0913371 -1.5283100 -0.592001 -0.904063 -2.925180 -0.9110750
## 3 -2.2727500 2.4252400 4.0403900 -7.649580 0.077515 3.334200 2.5907800
## 4 -0.7906500 1.9549800 1.7430200 0.740683 -2.980980 1.431890 0.0463297
## 5 -4.5678600 1.3606300 4.0414200 -6.849060 1.004470 0.942231 0.9938950
## 6 0.2204090 -0.5789740 0.7184660 -2.161640 1.027610 -0.659699 0.9437470
## PC346 PC347 PC348 PC349 PC350 PC351 PC352
## 1 1.555520 1.62938 0.193871 0.806673 3.681670 0.157506 2.261070
## 2 -2.141840 -2.26500 2.655180 0.684727 0.511199 -2.771200 -0.327101
## 3 -1.883540 -1.24766 -2.277650 -3.945900 -2.344300 3.911330 -1.120040
## 4 0.324020 -2.26947 -2.192310 0.992996 -0.395919 3.251930 -2.697270
## 5 -0.522512 -4.64829 1.447080 -0.393419 -0.469037 -0.214352 0.890393
## 6 -1.185560 2.12914 0.317007 0.109131 -0.574404 1.570330 -0.647675
## PC353 PC354 PC355 PC356 PC357 PC358 PC359
## 1 -0.747627 -1.360860 -0.8677220 -1.459610 0.5237200 -0.7379190 -0.496558
## 2 3.035450 3.092190 1.3426000 0.178390 0.0350144 0.3981450 -1.542860
## 3 -1.492490 0.592193 2.4164800 -1.694690 0.5136770 -1.5379200 1.960710
## 4 0.302172 1.444850 0.0238652 0.553455 -2.3594900 -2.6307300 -0.774727
## 5 -1.137630 0.386571 0.3044080 0.283366 0.1792130 0.6196390 -0.553634
## 6 -0.153455 -0.939886 -1.4121800 0.369414 -0.3625490 0.0751368 0.140945
## PC360 PC361 PC362 PC363 PC364 PC365 PC366
## 1 0.1005630 0.601838 -0.9152040 -1.136820 -0.978292 -0.536523 -0.320685
## 2 0.5257630 0.228573 0.0640412 1.145420 0.302891 -0.622729 -0.781707
## 3 -1.7631300 -0.934291 1.4760800 -0.156592 0.488529 -1.534110 0.177075
## 4 2.3956900 0.749763 1.5371900 -1.261380 0.829789 -1.514290 0.441355
## 5 1.5431700 0.235257 1.6635500 -1.188140 1.256030 -1.729400 -1.446580
## 6 0.0811594 -2.022910 -0.4864640 -0.180249 0.897919 -1.228760 -1.956150
## PC367 PC368 PC369 PC370 PC371 PC372 PC373
## 1 -0.693016 -0.4889840 1.553440 0.734916 -1.422660 1.232990 0.5801460
## 2 0.891434 1.1478200 1.071430 0.859475 2.959240 -0.189035 -0.0400263
## 3 -1.036580 -0.0768219 -3.092140 -0.241839 -1.353450 -1.145130 -0.0251595
## 4 0.131697 1.8047000 0.571333 0.116638 -1.314770 -1.305650 0.2663600
## 5 0.628864 -1.7677600 -1.580020 -0.846520 0.492426 0.268055 0.8404610
## 6 -1.282430 -0.9532700 -0.336196 2.567990 -1.191800 -0.144572 1.4176200
## PC374 PC375 PC376 PC377 PC378 PC379 PC380
## 1 0.342811 -0.3861330 -0.1771980 0.0480706 -0.172883 -0.236881 0.0308104
## 2 -0.148380 -0.6543820 1.4316000 -1.3863800 -0.741940 -0.586389 0.0516557
## 3 0.193468 1.6113500 0.1261890 0.8570780 0.923603 -0.649514 -0.9423180
## 4 1.153350 -0.4596140 0.2307100 -0.3895980 2.429950 -0.583954 -0.0376098
## 5 -0.580536 -0.2591580 0.0488473 -0.7785360 -0.255229 -0.159154 -0.2663450
## 6 -0.852078 -0.0267918 -1.6417000 -0.2971980 -0.668131 0.782563 1.7953000
## PC381 PC382 PC383 PC384 PC385 PC386 PC387
## 1 -0.1561710 0.948769 0.37848000 0.26300400 -0.125007 -1.592020 0.0750528
## 2 0.4313090 -1.001140 -0.28224700 -0.29224800 0.209364 0.023621 -0.1780050
## 3 1.6887900 -1.086440 0.80570000 0.22565200 0.345213 1.104060 1.0186600
## 4 -0.0920244 -0.377783 -0.00251592 -0.00534122 -0.278711 0.807360 -0.3682320
## 5 -0.6395330 -0.176339 1.21101000 -1.30166000 0.169436 1.009270 -0.5679560
## 6 -3.5984800 1.552660 3.31903000 -1.19246000 -0.764001 -1.502900 -5.2552900
## PC388 PC389 PC390 PC391 PC392 PC393 PC394
## 1 0.2936720 -1.0676200 -0.145780 -0.2267020 0.0253611 1.09868 0.278362
## 2 -0.0186367 0.1388060 0.496813 0.2309030 0.0553485 -2.36411 -0.236082
## 3 0.2178940 0.3764090 0.397538 -0.2400100 -0.0799731 -1.16805 0.357309
## 4 0.2112230 -0.8968090 -0.270367 -0.0734400 -0.1502990 1.33542 -0.140869
## 5 0.3254840 -0.0777772 0.479137 0.0557618 0.4172770 -1.21180 -0.273647
## 6 -7.0977400 -0.6925200 -3.425750 -1.5764100 1.4343600 34.36290 7.729560
## PC395 PC396 PC397 PC398 PC399 PC400 PC401
## 1 -0.279925 0.624945 -0.155042 0.1055230 0.270818 0.4827090 0.195881
## 2 1.184070 0.408101 0.287312 -0.3405600 -0.510715 0.6201510 -0.066248
## 3 0.762002 -0.674437 0.570041 -0.0531984 -1.053780 -0.3692760 -0.770869
## 4 -0.199232 -1.007120 -0.709010 0.7651120 -0.481188 -0.0275808 0.695200
## 5 0.884068 0.283521 0.304547 -0.9347460 0.736372 0.4340780 0.277280
## 6 -28.879200 -1.130180 -1.719540 -3.6083800 1.296860 -0.3888750 11.269700
## PC402 PC403 PC404 PC405 PC406 PC407 PC408
## 1 -0.0161798 -0.2691690 -0.1744020 0.0750264 0.3163120 0.4292230 -0.2823020
## 2 0.5865710 -0.0746562 -0.0937750 -0.0982788 -0.5053210 0.1581330 -0.0426388
## 3 0.0296957 0.8638460 -0.2417780 -0.2457440 0.3184530 -0.5076330 0.3541860
## 4 0.4484770 0.1638630 0.0793268 -0.3100790 -0.0686629 -0.1055580 -0.0276733
## 5 -0.7107760 0.3231550 0.1319450 -0.1099410 0.1297680 0.0739349 -0.0132180
## 6 1.0707500 1.0804400 1.1230100 -0.0877193 -0.1232730 -0.4902600 0.0947873
## PC409 Individual region Pop_City Country Latitude Longitude
## 1 -2.38321e-06 801 Southern Europe Durres Albania 41.29704 19.50373
## 2 -2.38321e-06 802 Southern Europe Durres Albania 41.29704 19.50373
## 3 -2.38321e-06 803 Southern Europe Durres Albania 41.29704 19.50373
## 4 -2.38321e-06 804 Southern Europe Durres Albania 41.29704 19.50373
## 5 -2.38321e-06 805 Southern Europe Durres Albania 41.29704 19.50373
## 6 -2.38321e-06 806 Southern Europe Durres Albania 41.29704 19.50373
## Continent Year Region Subregion order
## 1 Europe 2018 Southern Europe East Europe 25
## 2 Europe 2018 Southern Europe East Europe 25
## 3 Europe 2018 Southern Europe East Europe 25
## 4 Europe 2018 Southern Europe East Europe 25
## 5 Europe 2018 Southern Europe East Europe 25
## 6 Europe 2018 Southern Europe East Europe 25
# source the plotting function
source(
here("analyses", "my_theme2.R"
)
)
good.shapes = c(1:25,35:38,1:25)
colors2 <-
c("#E69F00",
"#799d10",
"#0072B2",
"yellow3",
"#B22222",
"#808080",
"#21a708",
"#FF7F00",
"#52ef99",
"#8E8BFF",
"#F781BF",
"#8B008B",
"purple",
"#2524f9",
"#E7297A",
"#1E90FF",
"chocolate4")
# make plot by country & region
ggplot(merged_data, aes(PC1, PC2)) +
geom_point(aes(color = Country, shape = Country), size = 1) +
xlab(paste0("PC1 (", perc[1], " Variance)")) +
ylab(paste0("PC2 (", perc[2], " Variance)")) +
labs(
caption = "PCA with 47,484 SNPs of 409 mosquitoes from 41 localities in Europe."
) +
guides(
color = guide_legend(title = "Country", ncol = 3),
shape = guide_legend(title = "Country", ncol = 3),
fill = guide_legend(title = "Region", ncol = 1)
) +
stat_ellipse(aes(fill = Region, group = Region), geom = "polygon", alpha = 0.2, level = 0.8) +
scale_color_manual(values = colors2) +
scale_shape_manual(values=good.shapes) +
my_theme() +
theme(
plot.caption = element_text(face = "italic"),
legend.position = "top",
legend.justification = "top",
legend.box.just = "center",
legend.box.background = element_blank(),
plot.margin = margin(5.5, 25, 5.5, 5.5, "points"),
legend.margin = margin(10,10,10,10)
)
#save the pca plot
ggsave(
here(
"output", "europe", "lea", "PCA_lea_pc1_pc2_r1.pdf"
),
width = 8,
height = 6,
units = "in"
)
PC1 and PC3
# source the plotting function
source(
here("analyses", "my_theme2.R"
)
)
good.shapes = c(1:25,35:38,1:25)
colors2 <-
c("#E69F00",
"#799d10",
"#0072B2",
"yellow3",
"#B22222",
"#808080",
"#21a708",
"#FF7F00",
"#52ef99",
"#8E8BFF",
"#F781BF",
"#8B008B",
"purple",
"#2524f9",
"#E7297A",
"#1E90FF",
"chocolate4")
# make plot by country & region
ggplot(merged_data, aes(PC1, PC3)) +
geom_point(aes(color = Country, shape = Country), size = 1) +
xlab(paste0("PC1 (", perc[1], " Variance)")) +
ylab(paste0("PC3 (", perc[3], " Variance)")) +
labs(
caption = "PCA with 47,484 SNPs of 409 mosquitoes from 41 localities in Europe."
) +
guides(
color = guide_legend(title = "Country", ncol = 3),
shape = guide_legend(title = "Country", ncol = 3),
fill = guide_legend(title = "Region", ncol = 1)
) +
stat_ellipse(aes(fill = Region, group = Region), geom = "polygon", alpha = 0.2, level = 0.8) +
scale_color_manual(values = colors2) +
scale_shape_manual(values=good.shapes) +
my_theme() +
theme(
plot.caption = element_text(face = "italic"),
legend.position = "top",
legend.justification = "top",
legend.box.just = "center",
legend.box.background = element_blank(),
plot.margin = margin(5.5, 25, 5.5, 5.5, "points"),
legend.margin = margin(10,10,10,10)
)
#save the pca plot
ggsave(
here(
"output", "europe", "lea", "PCA_lea_pc1_pc3_r1.pdf"
),
width = 8,
height = 6,
units = "in"
)
PC1 and PC4
# source the plotting function
source(
here("analyses", "my_theme2.R"
)
)
good.shapes = c(1:25,35:38,1:25)
colors2 <-
c("#E69F00",
"#799d10",
"#0072B2",
"yellow3",
"#B22222",
"#808080",
"#21a708",
"#FF7F00",
"#52ef99",
"#8E8BFF",
"#F781BF",
"#8B008B",
"purple",
"#2524f9",
"#E7297A",
"#1E90FF",
"chocolate4")
# make plot by country & region
ggplot(merged_data, aes(PC1, PC4)) +
geom_point(aes(color = Country, shape = Country), size = 1) +
xlab(paste0("PC1 (", perc[1], " Variance)")) +
ylab(paste0("PC4 (", perc[4], " Variance)")) +
labs(
caption = "PCA with 47,484 SNPs of 409 mosquitoes from 41 localities in Europe."
) +
guides(
color = guide_legend(title = "Country", ncol = 3),
shape = guide_legend(title = "Country", ncol = 3),
fill = guide_legend(title = "Region", ncol = 1)
) +
stat_ellipse(aes(fill = Region, group = Region), geom = "polygon", alpha = 0.2, level = 0.8) +
scale_color_manual(values = colors2) +
scale_shape_manual(values=good.shapes) +
my_theme() +
theme(
plot.caption = element_text(face = "italic"),
legend.position = "top",
legend.justification = "top",
legend.box.just = "center",
legend.box.background = element_blank(),
plot.margin = margin(5.5, 25, 5.5, 5.5, "points"),
legend.margin = margin(10,10,10,10)
)
We will do 5 repetitions
# set output dir
# main options
# K = number of ancestral populations
# entropy = TRUE computes the cross-entropy criterion, # CPU = 4 is the number of CPU used (hidden input) project = NULL
project = snmf(
genotype,
K = 1:20,
project = "new",
repetitions = 5,
CPU = 4,
entropy = TRUE
)
project = load.snmfProject("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmfProject")
Cross entropy
# Open a new pdf file
pdf(here("output","europe","lea","lea_cross_entropy_europe_r1.pdf"), width = 6, height = 4)
# Create your plot
plot(project, col = "blue", pch = 19, cex = 1.2)
# Close the pdf file
dev.off()
## png
## 2
Plot begins to plateau around 14-15 (though it does decrease slightly all the way until k=19)
Summary of project
## $repetitions
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7 K = 8 K = 9
## with cross-entropy 5 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5 5
## K = 10 K = 11 K = 12 K = 13 K = 14 K = 15 K = 16 K = 17
## with cross-entropy 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5
## K = 18 K = 19 K = 20
## with cross-entropy 5 5 5
## without cross-entropy 0 0 0
## total 5 5 5
##
## $crossEntropy
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7
## min 0.8809779 0.8620474 0.8512469 0.8468877 0.8413341 0.8379157 0.8353965
## mean 0.8818355 0.8631570 0.8522017 0.8479543 0.8424554 0.8391783 0.8365604
## max 0.8824166 0.8640562 0.8529647 0.8487077 0.8434736 0.8407652 0.8383382
## K = 8 K = 9 K = 10 K = 11 K = 12 K = 13 K = 14
## min 0.8323692 0.8306828 0.8298577 0.8273704 0.8272263 0.8256213 0.8241121
## mean 0.8342920 0.8320449 0.8304758 0.8289451 0.8280680 0.8266444 0.8256448
## max 0.8362286 0.8332221 0.8314288 0.8301834 0.8294269 0.8275571 0.8268800
## K = 15 K = 16 K = 17 K = 18 K = 19 K = 20
## min 0.8238352 0.8228338 0.8228329 0.8216179 0.8210350 0.8215349
## mean 0.8248575 0.8242061 0.8238525 0.8230369 0.8231018 0.8226386
## max 0.8259760 0.8253172 0.8258814 0.8248736 0.8244063 0.8231648
# get the cross-entropy of all runs for K = 15
ce = cross.entropy(project, K = 15)
ce #run 5 is best for k=15
## K = 15
## run 1 0.8242023
## run 2 0.8245658
## run 3 0.8257083
## run 4 0.8259760
## run 5 0.8238352
replace k15 with whatever is best
Default plot
# select the best run for K = 15 clusters
best = which.min(cross.entropy(project, K = 15))
# best is run 5
barchart(project, K = 15, run = best,
border = NA, space = 0,
col = colors2,
xlab = "Individuals",
ylab = "Ancestry proportions",
main = "Ancestry matrix") -> bp
axis(1, at = 1:length(bp$order),
labels = bp$order, las=1,
cex.axis = .4)
Mean admixture by country using ggplot
color_palette <-
c(
"blue",
"#008080",
"#B22222",
"#1E90FF",
"purple",
"#FF8C1A",
"#FFFF99",
"purple4",
"chocolate4",
"#FFB347",
"#F49AC2",
"green4",
"#B20CD9",
"#77DD77",
"green"
)
sampling_loc <- readRDS(here("output", "sampling_loc_all.rds"))
library(reshape2)
# Extract ancestry coefficients
Q_values <- as.data.frame(Q(project, K = 15, run = best))
# Create a named vector to map countries to regions
country_to_region <- c(
"France" = "1-Western Europe",
"Portugal" = "2-Southern Europe",
"Spain" = "2-Southern Europe",
"Italy" = "2-Southern Europe",
"Malta" = "2-Southern Europe",
"Slovenia" = "2-Southern Europe",
"Croatia" = "2-Southern Europe",
"Albania" = "2-Southern Europe",
"Serbia" = "2-Southern Europe",
"Greece" = "2-Southern Europe",
"Romania" = "3-Eastern Europe",
"Bulgaria" = "3-Eastern Europe",
"Turkey" = "3-Eastern Europe",
"Ukraine" = "3-Eastern Europe",
"Russia" = "3-Eastern Europe",
"Georgia" = "3-Eastern Europe",
"Armenia" = "3-Eastern Europe"
)
# Add the region to the data frame
sampling_loc$Region2 <- country_to_region[sampling_loc$Country]
# Add individual IDs and pops ids
Q_values$ind <- inds
Q_values$pop <- pops
# Melt the data frame for plotting
Q_melted <- melt(Q_values, id.vars = c("ind", "pop"))
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country
Q_ordered <- Q_joined |>
arrange(Region, Region2, Country) |>
mutate(Region_Country = factor(Region_Country, levels = unique(Region_Country)))
# Group by Country and calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(Region_Country, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <- data.frame(Region_Country = unique(Q_grouped$Region_Country))
# Add the order of each country to ensure correct placement of borders
borders$order <- 1:nrow(borders) + 0.5 # Shift borders to the right edge of the bars
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(Region_Country) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# source the plotting function
source(
here(
"analyses", "my_theme2.R"
)
)
# Generate all potential variable names
all_variables <- paste0("V", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge this data frame with Q_grouped_filtered to create the new color column
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create bar chart
ggplot(Q_grouped_filtered, aes(x = Region_Country, y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_segment(data = borders, aes(x = order, xend = order, y = 0, yend = 1, fill = NULL), linetype = "solid", color = "#2C444A") + # Add borders
my_theme() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none") + # Hide legend
xlab("") + # Suppress x-axis label
ylab("Ancestry proportions") +
ggtitle("Ancestry matrix") +
labs(caption = "Each bar represents the average ancestry proportions for individuals in a given country for k=15.") +
scale_fill_manual(values = color_palette) +
scale_x_discrete(labels = function(x) gsub(".*_", "", x)) # Remove Region prefix from labels
# save the pca plot ####
ggsave(
here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/LEA_k=15_r1_countries.pdf"
),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Using ggplot2 for individual admixtures
leak15 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmf/K15/run5/r2_0.1_r5.15.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak15)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.00333 0.0000999 0.0000999 3.92e-1 7.68e-2 3.52e-1 9.99e-5 9.99e-5 9.99e-5
## 2 0.0000999 0.0000999 0.0000999 6.95e-1 9.99e-5 6.51e-2 9.99e-5 1.86e-3 9.99e-5
## 3 0.0000999 0.0000999 0.0000999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 0.000100 0.000674 0.000723 5.37e-1 2.87e-2 3.18e-1 2.58e-2 2.30e-2 1.00e-4
## 5 0.0114 0.0169 0.0000999 3.91e-1 5.67e-2 4.95e-1 9.99e-5 9.99e-5 2.87e-2
## 6 0.0000999 0.0000999 0.0000999 3.14e-1 6.55e-2 3.16e-1 9.99e-5 9.99e-5 8.82e-3
## # ℹ 6 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 3.33174e-03 9.99370e-05 9.99370e-05 3.91670e-01 7.68396e-02
## 2 1066 SOC 9.99100e-05 9.99100e-05 9.99100e-05 6.95210e-01 9.99100e-05
## 3 1067 SOC 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05
## 4 1068 SOC 9.99550e-05 6.73902e-04 7.22595e-04 5.37351e-01 2.86666e-02
## 5 1069 SOC 1.14145e-02 1.68936e-02 9.99190e-05 3.90849e-01 5.66776e-02
## 6 1070 SOC 9.99460e-05 9.99460e-05 9.99460e-05 3.14370e-01 6.54754e-02
## X6 X7 X8 X9 X10 X11
## 1 3.52274e-01 9.99370e-05 9.99370e-05 9.99370e-05 0.132036000 9.99370e-05
## 2 6.51423e-02 9.99100e-05 1.86118e-03 9.99100e-05 0.204138000 9.99100e-05
## 3 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05 0.998602000 9.98737e-05
## 4 3.18323e-01 2.57999e-02 2.29808e-02 9.99550e-05 0.000099955 9.99550e-05
## 5 4.94572e-01 9.99190e-05 9.99190e-05 2.86936e-02 0.000099919 9.99190e-05
## 6 3.15551e-01 9.99460e-05 9.99460e-05 8.81821e-03 0.191967000 1.04828e-03
## X12 X13 X14 X15
## 1 1.24674e-02 3.02801e-02 4.01781e-04 9.99370e-05
## 2 9.99100e-05 3.26500e-02 9.99100e-05 9.99100e-05
## 3 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05
## 4 1.85801e-02 1.17723e-02 9.99550e-05 3.46303e-02
## 5 9.99190e-05 9.99190e-05 9.99190e-05 9.99190e-05
## 6 6.38827e-02 1.56492e-02 2.26381e-02 9.99460e-05
Rename the columns
# Rename the columns starting from the third one
leak15 <- leak15 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak15)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 3.33174e-03 9.99370e-05 9.99370e-05 3.91670e-01 7.68396e-02
## 2 1066 SOC 9.99100e-05 9.99100e-05 9.99100e-05 6.95210e-01 9.99100e-05
## 3 1067 SOC 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05
## 4 1068 SOC 9.99550e-05 6.73902e-04 7.22595e-04 5.37351e-01 2.86666e-02
## 5 1069 SOC 1.14145e-02 1.68936e-02 9.99190e-05 3.90849e-01 5.66776e-02
## 6 1070 SOC 9.99460e-05 9.99460e-05 9.99460e-05 3.14370e-01 6.54754e-02
## v6 v7 v8 v9 v10 v11
## 1 3.52274e-01 9.99370e-05 9.99370e-05 9.99370e-05 0.132036000 9.99370e-05
## 2 6.51423e-02 9.99100e-05 1.86118e-03 9.99100e-05 0.204138000 9.99100e-05
## 3 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05 0.998602000 9.98737e-05
## 4 3.18323e-01 2.57999e-02 2.29808e-02 9.99550e-05 0.000099955 9.99550e-05
## 5 4.94572e-01 9.99190e-05 9.99190e-05 2.86936e-02 0.000099919 9.99190e-05
## 6 3.15551e-01 9.99460e-05 9.99460e-05 8.81821e-03 0.191967000 1.04828e-03
## v12 v13 v14 v15
## 1 1.24674e-02 3.02801e-02 4.01781e-04 9.99370e-05
## 2 9.99100e-05 3.26500e-02 9.99100e-05 9.99100e-05
## 3 9.98737e-05 9.98737e-05 9.98737e-05 9.98737e-05
## 4 1.85801e-02 1.17723e-02 9.99550e-05 3.46303e-02
## 5 9.99190e-05 9.99190e-05 9.99190e-05 9.99190e-05
## 6 6.38827e-02 1.56492e-02 2.26381e-02 9.99460e-05
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation
## 1 Franceville Gabon -1.59207 13.53242 Africa GAB
## 2 Antananarivo Madagascar -18.87920 47.50790 Africa ANT
## 3 Diego ville Madagascar -12.27361 49.29372 Africa DGV
## 4 Morondava Madagascar -20.28420 44.27940 Africa MAD
## 5 Vohimasy Madagascar -22.81591 47.75026 Africa VOH
## 6 Dauguet Mauritius -20.18530 57.52154 Africa DAU
## Year Region Subregion order
## 1 2015 Central Africa Africa 72
## 2 2022 East Africa East Africa 76
## 3 2022 East Africa East Africa 77
## 4 2016 East Africa East Africa 78
## 5 2016 & 2017 East Africa East Africa 79
## 6 2022 Indian Ocean Indian Ocean 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak15 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"purple4",
"chocolate4",
"#FF8C1A",
"#FFFF99",
"purple",
"blue",
"yellow",
"#F49AC2",
"#77DD77",
"#008080",
"#B20CD9",
"green4",
"#1E90FF",
"#B22222",
"green"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n LEA inference for k15 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
## [1] 5
# Extract ancestry coefficients
leak20 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmf/K20/run5/r2_0.1_r5.20.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak20)
## # A tibble: 6 × 20
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0000999 0.407 0.0000999 2.68e-2 9.99e-5 9.99e-5 3.46e-4 5.68e-2 9.99e-5
## 2 0.146 0.703 0.0000999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 3 0.0000998 0.0000998 0.0000998 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5 9.98e-5
## 4 0.181 0.507 0.00793 9.99e-5 9.62e-3 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 5 0.0395 0.325 0.00142 9.99e-5 9.99e-5 1.10e-2 9.99e-5 3.67e-2 9.99e-5
## 6 0.00424 0.143 0.0000999 1.61e-2 9.99e-5 9.99e-5 9.99e-5 6.70e-2 9.99e-5
## # ℹ 11 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # X15 <dbl>, X16 <dbl>, X17 <dbl>, X18 <dbl>, X19 <dbl>, X20 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak20 <- leak20 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak20 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette20 <-
c(
"gray",
"#FFB347",
"#B22222",
"orchid",
"chocolate4",
"green",
"purple",
"#F49AC2",
"#B20CD9",
"#75FAFF",
"goldenrod3",
"#FFFF99",
"green4",
"#008080",
"blue",
"#1E90FF",
"#77DD77",
"#FF8C1A",
"purple4",
"yellow"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:20)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette20[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=20.\n LEA inference for k20 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette20) +
expand_limits(y = c(0, 1.5))
## [1] 5
Extract ancestry coefficients
# Extract ancestry coefficients
leak14 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmf/K14/run5/r2_0.1_r5.14.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak14)
## # A tibble: 6 × 14
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0000999 0.402 0.00399 1.43e-1 2.88e-2 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 2 0.0000999 0.661 0.0000999 2.26e-1 3.79e-2 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 3 0.0000999 0.0000999 0.0000999 9.99e-1 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 0.0147 0.532 0.000100 1.00e-4 1.22e-2 7.39e-3 2.22e-2 2.64e-3 1.46e-2
## 5 0.0000999 0.415 0.00907 9.99e-5 9.99e-5 7.68e-3 9.99e-5 1.33e-2 9.99e-5
## 6 0.000100 0.340 0.000100 1.90e-1 1.30e-2 2.79e-3 1.00e-4 1.00e-4 1.00e-4
## # ℹ 5 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak14 <- leak14 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak14 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette14 <-
c(
"yellow",
"#B20CD9",
"#B22222",
"chocolate4",
"green",
"#FF8C1A",
"purple",
"#F49AC2",
"purple4",
"#008080",
"#77DD77",
"blue",
"green4",
"#1E90FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:14)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette14[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=14.\n LEA inference for k20 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette14) +
expand_limits(y = c(0, 1.5))
## [1] 5
Extract ancestry coefficients
# Extract ancestry coefficients
leak13 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmf/K13/run5/r2_0.1_r5.13.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak13)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.579 0.0000999 0.00371 9.99e-5 9.99e-5 2.32e-2 3.75e-2 9.99e-5 8.99e-2
## 2 0.571 0.0000999 0.0000999 9.99e-5 9.99e-5 9.99e-5 4.05e-2 1.15e-2 6.91e-3
## 3 0.0000999 0.0000999 0.0000999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 0.687 0.000100 0.00336 2.53e-2 5.06e-3 2.26e-2 2.43e-2 6.43e-3 4.73e-2
## 5 0.739 0.00488 0.0112 6.12e-3 2.53e-2 1.00e-4 5.51e-3 3.83e-2 7.26e-2
## 6 0.542 0.000100 0.000100 2.74e-4 1.00e-4 7.94e-2 1.97e-2 1.93e-2 6.50e-2
## # ℹ 4 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak13 <- leak13 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak13 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette13 <-
c(
"#B22222",
"blue",
"chocolate4",
"#1E90FF",
"#FF8C1A",
"#008080",
"#77DD77",
"yellow",
"#F49AC2",
"green",
"#FFFF99",
"purple4",
"#B20CD9"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:13)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette13[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n LEA inference for k13 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette13) +
expand_limits(y = c(0, 1.5))
## [1] 5
leak18 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmf/K18/run5/r2_0.1_r5.18.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# unseen_pckmeans.7.Q
# pckmeans.7.Q
head(leak18)
## # A tibble: 6 × 18
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0000999 0.250 0.0000999 4.11e-2 9.99e-5 4.26e-1 9.99e-5 9.99e-5 9.99e-5
## 2 0.0000999 0.102 0.0000999 9.99e-5 9.99e-5 3.86e-1 9.99e-5 9.99e-5 9.99e-5
## 3 0.0000999 0.0000999 0.0000999 9.99e-5 9.99e-5 6.23e-4 9.99e-5 9.99e-5 9.99e-5
## 4 0.0173 0.292 0.0000999 9.18e-3 9.99e-5 2.57e-1 6.09e-4 9.99e-5 1.17e-2
## 5 0.000966 0.391 0.0000999 4.63e-2 9.99e-5 9.55e-2 1.24e-2 9.99e-5 9.99e-5
## 6 0.0000999 0.112 0.0000999 4.76e-2 1.63e-2 4.63e-2 9.99e-5 9.99e-5 1.55e-2
## # ℹ 9 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # X15 <dbl>, X16 <dbl>, X17 <dbl>, X18 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak18 <- leak18 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak18 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette18 <-
c(
"#1E90FF",
"green",
"yellow",
"blue",
"purple",
"#FF8C1A",
"chocolate4",
"green4",
"#F49AC2",
"#B20CD9",
"#B22222",
"#008080",
"#77DD77",
"#75FAFF",
"orchid",
"purple4",
"#FFFF99",
"#FFB347"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:18)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette18[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=18.\n LEA inference for k18 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette18) +
expand_limits(y = c(0, 1.5))
## [1] 5
# Extract ancestry coefficients
leak5 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmf/K5/run5/r2_0.1_r5.5.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0119 0.0457 0.000100 0.841 0.102
## 2 0.0172 0.000100 0.00172 0.933 0.0482
## 3 0.000100 0.000100 0.000100 1.00 0.000100
## 4 0.0185 0.0410 0.0452 0.800 0.0957
## 5 0.000100 0.0531 0.000100 0.830 0.117
## 6 0.0388 0.0824 0.00769 0.789 0.0819
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak5 <- leak5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette5 <-
c(
"#77DD37",
"#1E90FF",
"#FF8C1A",
"purple3",
"#FFFF19"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette5[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n LEA inference for k5 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette5) +
expand_limits(y = c(0, 1.5))
## [1] 5
# Extract ancestry coefficients
leak6 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmf/K6/run5/r2_0.1_r5.6.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak6)
## # A tibble: 6 × 6
## X1 X2 X3 X4 X5 X6
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0686 0.0773 0.000116 0.0123 0.839 0.00325
## 2 0.000100 0.0586 0.000100 0.00734 0.930 0.00402
## 3 0.000100 0.000100 0.000100 0.000100 1.00 0.000100
## 4 0.0437 0.0888 0.0239 0.0133 0.786 0.0444
## 5 0.0242 0.107 0.0468 0.000100 0.822 0.000100
## 6 0.117 0.0575 0.000100 0.0401 0.780 0.00506
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak6 <- leak6 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak6 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette6 <-
c(
"red",
"#1E90FF",
"purple3",
"#FFFF19",
"#FF8C1A",
"#77DD37"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:6)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette6[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=6.\n LEA inference for k6 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette6) +
expand_limits(y = c(0, 1.5))
## [1] 5
# Extract ancestry coefficients
leak4 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmf/K4/run5/r2_0.1_r5.4.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak4)
## # A tibble: 6 × 4
## X1 X2 X3 X4
## <dbl> <dbl> <dbl> <dbl>
## 1 0.846 0.0642 0.00502 0.0851
## 2 0.934 0.000100 0.00681 0.0588
## 3 1.00 0.000100 0.000100 0.000100
## 4 0.802 0.0554 0.0561 0.0867
## 5 0.838 0.0788 0.00619 0.0774
## 6 0.791 0.0931 0.00477 0.111
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak4 <- leak4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette4 <-
c(
"#1E90FF",
"purple3",
"#77DD37",
"#FF8C1A"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:4)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette4[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=4.\n LEA inference for k4 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette4) +
expand_limits(y = c(0, 1.5))
## [1] 5
# Extract ancestry coefficients
leak3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.snmf/K3/run5/r2_0.1_r5.3.Q"),
delim = " ",
col_names = FALSE,
show_col_types = FALSE
)
head(leak3)
## # A tibble: 6 × 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 0.0764 0.0719 0.852
## 2 0.000100 0.0501 0.950
## 3 0.000100 0.000100 1.00
## 4 0.0680 0.122 0.810
## 5 0.0881 0.0694 0.843
## 6 0.107 0.0955 0.798
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak3 <- leak3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette3 <-
c(
"#FF8C1A",
"purple3",
"#1E90FF"
)
# "#77DD37",
# Generate all potential variable names
all_variables <- paste0("v", 1:3)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette3[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=3.\n LEA inference for k3 with 47,484 SNPs.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette3) +
expand_limits(y = c(0, 1.5))
ggsave(
here("output", "europe", "lea", "lea_k=3_europe_r2_1_run5.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Overall, the datasets pruned with r2<0.01 and r2<0.1 look very similar. The biggest difference seems to be that some pops, e.g. the 2 in Greece, are somewhat less admixed with the <0.1 set.
Check data - we created 2 vcf files with LD pruning r2<0.01 (LD1, Set1) and r2<0.1 (LD2, Set2) after QC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1_b.vcf
Import the data
genotype <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.vcf"
)
d <- read.vcfR(
genotype
)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 20968
## column count: 419
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 20968
## Character matrix gt cols: 419
## skip: 0
## nrows: 20968
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant: 20968
## All variants processed
Get population and individuals information
inds_full <- attr(d@gt,"dimnames")[[2]]
inds_full <- inds_full[-1]
a <- strsplit(inds_full, '_')
pops <- unname(sapply(a, FUN = function(x) return(as.character(x[1]))))
table(pops)
## pops
## ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC IMP ITB ITP ITR KER KRA
## 10 12 12 10 12 13 10 14 12 16 12 12 11 10 4 5 8 12 12 12
## MAL POL POP RAR ROM ROS SCH SER SEV SIC SLO SOC SPB SPC SPM SPS STS TIK TIR TRE
## 12 2 12 12 4 11 5 4 12 9 12 12 8 6 6 8 7 12 4 12
## TUA TUH
## 9 12
Convert format
##
## - number of detected individuals: 410
## - number of detected loci: 20968
##
## For SNP info, please check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.vcfsnp.
##
## 0 line(s) were removed because these are not SNPs.
## Please, check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.removed file, for more informations.
## [1] "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.geno"
##
## - number of detected individuals: 410
## - number of detected loci: 20968
##
## For SNP info, please check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.vcfsnp.
##
## 0 line(s) were removed because these are not SNPs.
## Please, check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.removed file, for more informations.
##
##
## - number of detected individuals: 410
## - number of detected loci: 20968
## [1] "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.lfmm"
PCA
## [1] "******************************"
## [1] " Principal Component Analysis "
## [1] "******************************"
## summary of the options:
##
## -n (number of individuals) 410
## -L (number of loci) 20968
## -K (number of principal components) 410
## -x (genotype file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.lfmm
## -a (eigenvalue file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.pca/r2_0.01_b.eigenvalues
## -e (eigenvector file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.pca/r2_0.01_b.eigenvectors
## -d (standard deviation file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.pca/r2_0.01_b.sdev
## -p (projection file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.pca/r2_0.01_b.projections
## -c data centered
## * pca class *
##
## project directory: /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/
## pca result directory: r2_0.01_b.pca/
## input file: r2_0.01_b.lfmm
## eigenvalue file: r2_0.01_b.eigenvalues
## eigenvector file: r2_0.01_b.eigenvectors
## standard deviation file: r2_0.01_b.sdev
## projection file: r2_0.01_b.projections
## pcaProject file: r2_0.01_b.pcaProject
## number of individuals: 410
## number of loci: 20968
## number of principal components: 410
## centered: TRUE
## scaled: FALSE
Test
## [1] "*******************"
## [1] " Tracy-Widom tests "
## [1] "*******************"
## summary of the options:
##
## -n (number of eigenvalues) 410
## -i (input file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.pca/r2_0.01_b.eigenvalues
## -o (output file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.pca/r2_0.01_b.tracywidom
# tw$pvalues
# plot the percentage of variance explained by each component
plot(tw$percentage, pch = 19, col = "blue", cex = .8)
Get values
# plot preparation
pc.coord <- as.data.frame(pc$projections)
colnames(pc.coord) <- paste0("PC", 1:nPC)
pc.coord$Individual <- inds
pc.coord$Population <- pops
# perc1 <- paste0(round(tw$percentage, digits = 3) * 100, "%")
perc <- paste0(round(pc$eigenvalues/sum(pc$eigenvalues), digits = 3) * 100, "%")
nb.cols <- 40
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
Check R symbols for plot
#to see all shapes -> plot shapes - para escolher os simbolos
N = 100; M = 1000
good.shapes = c(1:25,33:127)
foo = data.frame( x = rnorm(M), y = rnorm(M), s = factor( sample(1:N, M, replace = TRUE) ) )
ggplot(aes(x,y,shape=s ), data=foo ) +
scale_shape_manual(values=good.shapes[1:N]) +
geom_point()
Sample data
## Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1 Berlin, NJ USA 39.79081 -74.9291 Americas BER 2018
## 2 Columbus, OH USA 39.97170 -82.9071 Americas COL 2015
## 3 Palm Beach USA 26.70560 -80.0364 Americas PAL 2018
## 4 Houston, TX USA 29.75491 -95.3505 Americas HOU 2018
## 5 Los Angeles USA 34.05220 -118.2437 Americas LOS 2018
## 6 Manaus, AM Brazil -3.09161 -60.0325 Americas MAU 2017
## Region Subregion order order2 orderold
## 1 North America 1 NA 75
## 2 North America 2 NA 76
## 3 North America 3 NA 77
## 4 North America 4 NA 78
## 5 North America 5 NA 79
## 6 South America 6 NA 80
Check pops
## [1] SOC SOC SOC SOC SOC SOC
## 42 Levels: ALD ALU ALV ARM BAR BRE BUL CES CRO DES FRS GES GRA GRC IMP ... TUH
## [1] 42
Check the regions
## [1] "North America" "South America" "Western Europe" "Southern Europe"
## [5] "Eastern Europe" "East Asia" "South Asia" "Southeast Asia"
## [9] "West Africa" "East Africa" "Indian Ocean" "Central Africa"
## [13] "North Africa" "Caribbean"
Merge
merged_data <- merge(pc.coord, sampling_loc, by.x = "Population", by.y = "Abbreviation")
head(merged_data)
## Population PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 1 ALD -8.30703 -18.4772 30.7668 8.78149 -8.21542 1.595980 0.252519
## 2 ALD -9.03099 -21.1681 35.0124 7.67807 -6.69699 4.884060 -2.461390
## 3 ALD -7.96804 -26.2619 26.1057 11.40140 -7.78105 7.900910 0.893444
## 4 ALD -7.73401 -23.1584 31.9603 3.41562 -8.60297 -0.778845 -0.818793
## 5 ALD -8.76778 -18.3715 29.5787 2.96283 -8.89836 0.508140 3.863840
## 6 ALD -7.36398 -22.5720 35.1416 8.31307 -7.48994 5.397800 -0.679573
## PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15
## 1 0.836280 7.3385300 -1.75961 1.891960 7.14693 -9.12201 -1.033590 0.561650
## 2 -2.231890 9.0237700 -3.92305 -0.511047 14.05710 -8.66520 -1.769020 3.801980
## 3 -0.370961 5.6920900 -3.38880 0.796323 14.62680 -4.82640 1.842030 0.522097
## 4 -4.197340 6.2208100 -0.23525 -0.923469 1.36430 -1.55826 2.466910 -1.765170
## 5 -1.192480 -0.0215345 3.41123 3.459110 6.95105 -8.86136 -0.123605 0.728926
## 6 1.237470 3.8683500 -3.93310 2.771010 6.80318 -7.32523 -1.235780 2.743930
## PC16 PC17 PC18 PC19 PC20 PC21 PC22
## 1 11.97760 4.883660 2.541660 3.074420 -4.87936 -4.407260 -8.53969
## 2 11.85910 2.533370 -0.423637 -0.192915 -4.21119 -2.866060 -2.92180
## 3 10.60610 0.596138 -1.402300 -0.375270 -7.63234 -2.813950 -6.64685
## 4 0.67315 2.837320 -3.843230 -2.442590 -6.66311 0.303060 9.04182
## 5 9.26730 -3.905150 -11.146100 -1.713140 11.23800 -11.588100 -10.89770
## 6 9.89122 2.555240 -1.105610 4.878290 -7.69483 -0.775554 -8.06874
## PC23 PC24 PC25 PC26 PC27 PC28 PC29
## 1 3.39727 -0.754296 2.4559500 8.614130 -4.53681 4.615520 1.05838
## 2 6.00423 -1.010680 0.2934060 7.698430 3.23317 3.577510 3.11787
## 3 7.79747 -0.992685 -1.2132000 5.613660 6.88347 4.404370 8.71651
## 4 -3.37492 6.365410 -2.2620900 1.638900 8.56862 -0.501335 3.32665
## 5 4.22101 -2.057690 0.0698244 7.969440 -2.01621 1.262440 -5.47148
## 6 6.02060 -2.356960 10.8073000 0.266661 -11.00850 7.841810 -17.34870
## PC30 PC31 PC32 PC33 PC34 PC35 PC36
## 1 2.607280 -0.916739 -10.811600 14.11270 5.09857 16.19990 11.83590
## 2 2.071330 1.262860 -8.514670 8.64555 3.87414 6.37691 10.70870
## 3 -1.192320 1.465310 -2.598990 6.41948 -3.76999 8.75650 8.99059
## 4 -1.330960 0.515621 -6.346730 3.21032 -10.88700 -1.50553 2.33068
## 5 0.389747 -0.804103 0.322069 5.86603 1.78342 4.77765 1.36416
## 6 3.241140 -10.401100 -0.339116 3.63540 2.28672 -2.86063 -5.36387
## PC37 PC38 PC39 PC40 PC41 PC42 PC43 PC44
## 1 -5.546320 2.827020 -1.894770 -7.65558 -3.44706 6.06761 -10.60970 -3.540350
## 2 -1.719440 0.721703 0.589782 -5.79728 0.50878 8.15043 -13.45440 1.151770
## 3 -3.030050 -7.637100 4.802420 -6.93317 3.07902 8.63585 -9.09946 -0.492022
## 4 0.132235 4.678850 -2.188960 -1.13202 -9.81111 2.45240 5.06414 -6.685700
## 5 -6.710530 -1.545920 -2.868050 -1.30747 0.29234 2.00519 -5.64305 -8.185120
## 6 -1.860710 10.404100 1.560770 7.18874 13.09420 -1.33430 -2.09286 -1.145420
## PC45 PC46 PC47 PC48 PC49 PC50 PC51 PC52
## 1 3.69245 7.10187 -0.397589 7.938350 -6.337600 -9.05444 3.80463 -9.74436
## 2 -3.87730 8.72043 -3.058920 6.928460 -0.752548 -4.74792 4.42712 -8.98997
## 3 -5.28467 10.82630 0.310316 10.691000 -3.702660 -1.20415 8.11637 -4.51591
## 4 -6.75802 -4.26397 -5.251790 -0.977606 -8.736760 -9.16398 1.32720 -5.98141
## 5 4.14080 -3.84497 3.782940 8.535900 2.827050 -6.69455 1.49356 -6.51924
## 6 3.40223 -1.63950 13.219600 -13.357100 6.706390 24.40510 -26.25780 -4.83208
## PC53 PC54 PC55 PC56 PC57 PC58 PC59
## 1 -1.881420 -2.450980 3.313480 0.733659 -5.78848 0.175109 0.994373
## 2 -7.122780 -1.816080 -0.428055 1.095460 0.12379 -4.529350 11.438800
## 3 -2.754490 -8.894940 -3.953970 3.263120 1.47633 -7.315220 3.166310
## 4 0.552355 6.881380 -0.419715 -3.615670 7.13267 -7.685110 1.300980
## 5 -2.745990 0.794184 0.875250 -9.532150 1.87714 0.952087 -0.169903
## 6 21.414700 -14.970200 7.962070 -7.832620 -12.80610 16.836300 1.838560
## PC60 PC61 PC62 PC63 PC64 PC65 PC66
## 1 -0.594757 -0.658330 1.46032 -7.622650 5.802850 -3.70347 -4.547970
## 2 -4.335060 0.463289 -3.41587 -5.168530 0.926892 -6.73353 -4.963490
## 3 -7.376750 -2.734040 -4.61548 -0.293254 -7.094120 -7.37651 -13.002600
## 4 -1.573210 1.900620 5.98428 3.555880 6.126250 -8.60803 1.833140
## 5 -4.817860 -1.187880 -2.68886 6.119760 13.385200 2.39779 -0.207561
## 6 -2.341360 8.366280 -5.77262 -3.259350 -8.904470 -10.16390 8.678500
## PC67 PC68 PC69 PC70 PC71 PC72 PC73
## 1 0.805742 4.56723 -0.742004 3.439010 1.207090 -4.56764 -1.87931
## 2 -2.620010 1.70903 0.406734 0.716262 2.170970 8.37895 -4.30759
## 3 -12.913300 2.59735 -2.694340 -13.384700 3.227940 4.28303 -2.08939
## 4 1.043300 -9.28652 -1.254580 -3.067140 -5.645610 7.35801 6.32560
## 5 -4.231150 -1.16911 8.369170 2.810890 -0.266246 11.66030 -9.12075
## 6 -6.004050 8.47308 -4.879820 4.370550 2.957350 8.42037 2.86404
## PC74 PC75 PC76 PC77 PC78 PC79 PC80 PC81
## 1 -2.326950 -4.012010 -8.361660 0.856753 -4.58498 -6.32193 1.291560 -2.475040
## 2 0.104409 -0.289165 -7.004220 7.659700 -2.02616 4.09071 5.641170 2.889420
## 3 0.541529 -2.105990 -9.797030 3.234220 -8.15306 1.16679 -3.314800 5.187850
## 4 -2.320400 1.755670 -5.559560 -4.195270 4.42223 7.37963 -0.145694 0.480200
## 5 5.884880 -0.783922 -1.320120 12.655900 1.40415 6.94514 1.920560 -5.275360
## 6 -7.237220 -5.411570 0.580693 -9.617590 -9.58794 -9.81012 -1.553630 0.440467
## PC82 PC83 PC84 PC85 PC86 PC87 PC88
## 1 1.266480 6.747820 -5.89311 5.982740 -8.86710 0.00327945 -3.1803100
## 2 1.398870 3.109340 -5.71266 1.733720 -4.85998 -2.04706000 3.4852300
## 3 4.472310 0.928489 -12.31610 -0.137084 -1.54383 -3.55737000 -0.2737900
## 4 -9.418110 4.728620 -2.06108 12.784900 7.48513 5.12201000 0.0594954
## 5 -0.464576 0.265168 -6.52731 6.917190 7.19265 -0.14362800 -7.3494700
## 6 3.137300 8.503810 -0.75056 0.371729 -2.46833 -5.37974000 1.2105200
## PC89 PC90 PC91 PC92 PC93 PC94 PC95 PC96
## 1 -1.691900 -3.58961 7.27675 -2.97092 -7.314950 11.96840 -1.83857 -5.26662
## 2 -3.419110 -2.33061 1.44612 -9.84811 -0.839558 -3.11331 -1.15624 2.71095
## 3 -7.546140 -1.56030 -4.75245 -10.83350 -0.450165 -3.64618 -4.72716 6.09859
## 4 11.259000 -8.84193 1.66299 -1.52849 -8.345370 7.44430 -5.84123 6.65331
## 5 -0.410123 3.08059 7.99805 7.36546 6.625750 4.79192 7.53139 -3.51378
## 6 5.334150 -1.23582 6.69017 -6.59163 -1.067390 -7.23939 -2.65574 -3.08715
## PC97 PC98 PC99 PC100 PC101 PC102 PC103
## 1 6.812850 0.880764 2.682910 6.53014 2.068150 0.0205853 5.139570
## 2 -4.574280 -5.160630 -0.560690 10.68670 2.574770 -0.9191500 3.419460
## 3 3.458680 -10.118400 8.794550 7.71880 7.819630 -2.6526200 -7.730530
## 4 -2.600390 4.787620 0.478468 -4.11073 0.749219 3.2537900 -6.061760
## 5 1.280490 -8.386240 10.458200 -5.51831 -2.906730 -2.9117500 -0.192519
## 6 0.409129 9.012290 3.046660 3.68981 1.955450 7.0676100 7.812970
## PC104 PC105 PC106 PC107 PC108 PC109 PC110
## 1 -2.63457 -7.327750 3.67871 0.676518 -1.75800 -3.373660 13.661300
## 2 -2.62409 0.165796 -2.92761 0.744786 -2.34596 -1.547410 5.089160
## 3 -2.32767 -2.872950 -6.91824 11.394700 1.40872 -4.192000 4.660900
## 4 -16.47580 5.651090 -10.69280 -2.787740 -2.70999 0.713378 -2.647890
## 5 14.28910 6.540000 7.24427 -2.179380 -2.57112 -8.443390 0.730188
## 6 2.21214 -1.309410 5.24569 -5.653040 5.69477 1.455020 -2.796800
## PC111 PC112 PC113 PC114 PC115 PC116 PC117
## 1 10.5353000 -9.43850 -3.9915500 -1.13106 1.627450 -0.623789 3.865470
## 2 4.8209300 -2.78654 0.0537052 -4.07770 3.312480 -1.367690 -1.577070
## 3 0.0769366 3.04950 7.2911000 -4.63690 -2.279350 -2.112740 -2.497430
## 4 5.2892400 -1.16340 -10.1630000 2.08629 -2.710540 5.087860 6.420200
## 5 -1.8924500 -1.66273 -1.3751700 3.32388 -0.160393 2.213710 0.455934
## 6 -0.9720430 3.25462 -5.3586800 4.68896 1.792240 -5.866610 -0.170478
## PC118 PC119 PC120 PC121 PC122 PC123 PC124
## 1 -2.38239 7.73532 -6.17957 2.0997800 0.788929 9.623330 -2.316800
## 2 3.17677 -2.28733 1.03354 -1.5338800 -4.277440 6.726070 -0.857729
## 3 -1.04739 -5.57028 2.75290 -14.1120000 3.328420 -8.005040 3.524810
## 4 3.44547 7.89935 -3.00594 0.0576298 -2.954720 -11.447900 -7.184660
## 5 -3.35853 -6.45348 2.07303 -3.4198800 0.987759 6.834250 0.897731
## 6 -1.13715 5.62298 1.56556 -0.6944430 7.003260 0.384027 -1.152630
## PC125 PC126 PC127 PC128 PC129 PC130 PC131
## 1 5.297400 -8.63745 8.8065300 -8.14982 3.36255 -0.595500 -0.202445
## 2 -0.192643 -6.49742 10.1149000 7.31072 1.69932 0.509151 -6.458760
## 3 -1.253180 5.75523 -4.7447600 11.54540 1.75188 -0.940111 1.588950
## 4 -1.128020 -7.04889 -0.0755007 -13.69490 3.62660 -3.458370 -6.725740
## 5 -3.999220 6.22512 -7.0667600 -15.53610 13.69010 -4.272470 3.301600
## 6 -3.582370 4.78460 -0.2907040 -3.65475 -1.51422 -3.842990 2.046260
## PC132 PC133 PC134 PC135 PC136 PC137 PC138
## 1 -1.93790 5.13208 -2.155120 1.588080 0.538063 0.399116 -3.229040
## 2 -11.95650 -0.23188 -1.435690 0.300436 -8.040620 -0.782967 -2.341400
## 3 -1.16137 -2.63657 2.081280 -5.504730 -4.364550 1.501750 -3.819520
## 4 -6.38512 1.49440 -0.753075 2.961490 -4.984520 -0.945520 0.628155
## 5 -13.29020 -10.02400 5.256290 9.962230 -1.605070 -6.429670 8.862630
## 6 -3.81668 4.17271 -4.677230 -1.356480 4.898140 -3.943950 7.079680
## PC139 PC140 PC141 PC142 PC143 PC144 PC145
## 1 -0.768417 2.371470 0.351296 -0.460094 7.3834300 1.93106 -9.89451
## 2 5.020500 -10.508700 6.387960 2.771520 5.8927800 -5.94347 7.30664
## 3 7.991280 -12.161000 2.872720 -3.979150 -6.3592200 -1.35357 2.23241
## 4 2.294920 0.479558 1.707550 -4.770980 -0.0757234 -9.27704 1.44833
## 5 2.278010 2.083390 -8.310390 6.758380 1.4211300 -4.49196 1.91292
## 6 1.225440 -4.150540 -1.546730 -0.208617 2.8035800 -4.81814 6.03824
## PC146 PC147 PC148 PC149 PC150 PC151 PC152
## 1 1.29757 2.99257 4.3956500 1.250750 -5.47666 3.534580 -3.386850
## 2 -2.27101 -3.87441 -1.4353200 -0.569839 -4.11697 -2.183110 -4.837810
## 3 -3.97272 3.38396 0.8898280 -2.476430 7.48860 1.245540 -0.765761
## 4 -7.45126 -9.64670 0.0598099 -4.892330 -5.78175 0.203226 -7.554590
## 5 7.63676 -18.06110 -4.6294000 7.126750 5.68241 -0.619642 -0.234867
## 6 2.52416 2.41677 1.5705600 1.852090 -4.17679 -3.288790 -4.488840
## PC153 PC154 PC155 PC156 PC157 PC158 PC159
## 1 1.29337 9.43018 -4.602950 0.710417 -0.783431 -1.536970 -7.52020
## 2 -8.09352 -4.25492 9.978990 3.808930 9.105600 -6.513940 0.55890
## 3 5.29973 -4.88733 -7.820100 -7.852140 3.951220 -1.976630 1.60090
## 4 3.83714 15.83240 13.251200 5.637840 -7.876060 -0.137528 -1.34140
## 5 -10.75100 5.15958 -0.877378 6.492900 -7.584120 -1.150000 2.60864
## 6 2.97514 -2.80829 1.447280 4.601170 -1.024240 2.324360 -1.13972
## PC160 PC161 PC162 PC163 PC164 PC165 PC166 PC167
## 1 4.0332500 5.74444 1.628920 -8.09469 -3.05901 10.009700 -8.283100 3.16919
## 2 0.0573239 5.14853 2.163130 -13.06480 4.92035 1.616900 1.923670 10.58470
## 3 -2.8409000 1.77252 0.666614 6.45610 4.48876 -5.061620 -0.235488 -8.13959
## 4 1.9906500 3.81050 -15.595500 3.14852 9.33174 -3.234220 -9.674420 1.98107
## 5 1.8783100 4.21335 4.280360 7.53852 2.60141 15.444500 2.751800 -0.04802
## 6 -0.9652350 4.70265 4.067510 4.66024 -8.07833 0.502014 -1.898540 -2.08536
## PC168 PC169 PC170 PC171 PC172 PC173 PC174
## 1 11.357500 0.395458 -5.188560 -9.90259 -0.317008 2.986880 -4.542210
## 2 0.977356 -1.387860 4.879300 2.04584 5.282630 -2.527200 0.716552
## 3 0.369022 4.396700 -1.277860 -5.51953 -6.942540 -0.043279 -8.200440
## 4 1.292140 -4.872830 -0.639078 5.16690 3.049450 1.049170 9.961350
## 5 3.239250 -10.414200 3.318010 8.47382 2.931790 9.753860 7.968860
## 6 -1.236760 3.985260 0.222380 2.75292 1.463900 -2.815040 -0.253820
## PC175 PC176 PC177 PC178 PC179 PC180 PC181
## 1 4.041960 -22.23360 0.77378100 -0.93542 -9.237420 1.207400 3.053760
## 2 -0.398623 -5.61680 0.02338490 1.82526 4.054790 -8.700830 7.188880
## 3 -5.911570 8.41548 -0.00106676 -4.37380 0.934718 0.237307 1.989540
## 4 1.315610 -3.93551 10.31630000 2.31800 1.979610 8.446500 0.445591
## 5 -15.540500 1.29593 2.93206000 -3.71748 -2.330590 8.405490 -0.538828
## 6 0.733629 1.35877 3.70396000 3.16496 -2.229570 0.825850 -2.416160
## PC182 PC183 PC184 PC185 PC186 PC187 PC188
## 1 0.659074 -0.260192 5.357160 -0.512755 2.115470 -4.20397 4.497400
## 2 10.300600 -4.115260 4.611330 9.933690 -2.923630 -7.96074 -0.861876
## 3 -7.556540 1.488780 -9.412400 0.213330 4.279840 2.67380 -2.243240
## 4 1.390060 -5.350430 1.914540 2.249300 -3.448830 -1.58122 -6.771840
## 5 -5.592570 3.943740 -2.766280 -2.757710 0.465282 2.87782 2.928850
## 6 -1.459380 -0.582633 -0.215533 -1.130570 1.300780 3.94880 -4.110140
## PC189 PC190 PC191 PC192 PC193 PC194 PC195
## 1 -0.187531 3.561120 -3.914720 -11.437600 1.25344 -5.091250 -2.118130
## 2 0.313349 -2.497010 0.654489 -2.097760 -1.99603 -11.364000 -6.592900
## 3 -5.482010 0.683750 1.950300 -6.010300 8.01753 -0.963021 -2.219790
## 4 5.241260 1.982130 0.355849 0.370579 10.86460 -5.322950 0.117172
## 5 1.551820 1.066270 3.417200 7.724190 -1.09099 5.184030 -0.926992
## 6 2.090420 0.957614 3.474180 -3.952960 -1.09865 -1.087650 2.863520
## PC196 PC197 PC198 PC199 PC200 PC201 PC202 PC203
## 1 3.11881 1.24308 4.99088 -3.25522 8.270720 -1.451320 -5.84496 -5.68486
## 2 -1.96896 -11.67670 0.50075 6.06616 -0.334791 0.149688 -1.10820 14.24390
## 3 5.93215 4.63167 2.88083 1.47843 9.281540 -0.823894 -1.53547 -6.71447
## 4 -4.66076 -1.10074 -3.58025 -3.53704 -5.512330 3.876910 3.11157 1.26578
## 5 -2.71321 10.38620 2.59098 1.91733 0.293909 1.152530 -1.26362 -15.20880
## 6 -1.69820 -1.04243 -2.68786 -1.07191 -3.209370 2.182440 -3.26430 1.50874
## PC204 PC205 PC206 PC207 PC208 PC209 PC210
## 1 7.599140 -0.4890700 2.846520 2.848800 0.852770 4.583700 7.06682
## 2 -7.859010 0.0652980 1.580670 0.607973 4.598860 3.536750 6.71509
## 3 0.414647 -0.0577861 -0.580652 7.225000 -0.269713 -0.313945 1.66747
## 4 4.204550 -3.5516200 -0.685287 -1.096810 3.714720 -4.764210 -3.01243
## 5 11.264700 3.7842300 -1.525110 1.382010 -5.277600 10.737000 -3.18022
## 6 -2.626010 -1.6572900 -3.343560 4.994020 -1.333390 1.243420 -1.07813
## PC211 PC212 PC213 PC214 PC215 PC216 PC217 PC218
## 1 17.262600 -4.67515 5.81610 6.917400 6.766540 5.75890 4.56392 3.302770
## 2 -0.592453 -3.25361 8.10363 -7.933150 -2.520040 -4.78296 -0.32545 -9.533720
## 3 -3.969070 -7.58948 -1.65366 -0.259271 0.440712 -1.10663 -1.42543 -8.916220
## 4 -0.357485 -2.89218 3.43647 1.883470 -6.076450 2.87988 -8.32774 -0.225754
## 5 0.482577 3.58031 4.78279 7.992950 -8.258300 -2.86669 -0.74211 2.597930
## 6 -4.186930 -3.10714 -2.50248 -1.054210 -1.829190 -2.55706 -3.57554 3.287700
## PC219 PC220 PC221 PC222 PC223 PC224 PC225 PC226
## 1 0.692377 -5.69863 1.852700 -7.296880 -3.0161300 0.264491 -2.42765 3.38838
## 2 -0.167453 -2.52099 -8.710390 0.039052 2.0521200 -1.573720 -9.15385 -1.35060
## 3 2.434460 0.24178 -3.315350 -4.016790 6.5261200 0.538950 -1.92935 2.73615
## 4 -6.851560 4.80440 3.888690 -7.579110 -8.0972300 8.662160 8.16326 2.00011
## 5 -5.765780 2.30679 6.722870 4.093930 1.6351800 -5.545000 1.26107 3.82637
## 6 0.670042 1.30095 0.771785 -1.799720 -0.0599502 0.566037 4.56907 -1.12163
## PC227 PC228 PC229 PC230 PC231 PC232 PC233 PC234
## 1 7.44992 1.71086 4.63983 -2.01706 -10.604200 10.333000 -8.584510 -4.05353
## 2 6.86345 -6.30149 11.67920 4.70546 3.375070 -1.373950 -6.440960 -1.43285
## 3 1.19881 -6.13999 1.30273 -4.98056 -1.992480 0.634851 -6.291200 1.05093
## 4 -6.52240 7.29628 1.14370 1.96909 10.421300 5.070300 -7.247220 -9.06312
## 5 -1.56437 -3.44080 -2.30638 7.98918 -2.478310 0.730803 4.645200 8.77441
## 6 -1.35819 -3.37790 -1.20313 2.48302 0.656529 2.383880 0.454615 1.42390
## PC235 PC236 PC237 PC238 PC239 PC240 PC241 PC242
## 1 8.705200 4.254190 -1.17863 1.69888 -2.2701300 -7.20195 -4.15221 0.703490
## 2 1.392930 3.666630 1.80231 -2.45670 0.5755290 10.32070 1.92375 -0.921910
## 3 6.068800 2.439860 2.67442 -4.99648 1.0818000 1.07930 5.15830 -0.255426
## 4 0.286896 -3.567090 -2.19040 3.24737 0.0766803 2.49070 3.66431 -6.313250
## 5 -1.046130 6.509140 -1.92298 -3.39888 -2.3786000 3.87036 -8.13496 -0.346266
## 6 0.382214 -0.341849 2.91359 -2.80606 -1.2785300 1.72237 2.34043 -3.137690
## PC243 PC244 PC245 PC246 PC247 PC248 PC249 PC250
## 1 -4.869300 -1.78775 8.335620 -0.451932 -7.91832 1.3842800 -7.46858 -0.338846
## 2 -2.134910 6.82829 0.764576 -4.371630 4.04053 -3.5030400 6.11507 -8.544270
## 3 -2.624630 2.72871 2.761050 3.147980 4.91947 7.7854100 -3.44450 6.625540
## 4 1.767870 2.30145 0.799496 5.211330 4.54395 1.1262200 -1.16183 -4.951270
## 5 -0.479501 -2.01449 -6.638740 -2.421500 -1.64682 3.4502800 9.42246 -0.615166
## 6 -0.585581 2.04106 -0.235718 -4.161950 1.89633 0.0237993 1.81597 -0.654947
## PC251 PC252 PC253 PC254 PC255 PC256 PC257
## 1 -4.650340 -2.023780 2.620640 3.676560 2.39770 12.9307000 -5.38963
## 2 7.551510 5.516070 -2.361060 0.654199 -4.71135 -3.6169500 4.60843
## 3 -9.424370 0.388282 -2.679660 -6.162530 5.45146 -0.6355940 -1.20924
## 4 -1.374170 3.651500 6.486900 2.270000 1.35640 1.8314400 3.87046
## 5 -1.684960 2.989500 0.839364 10.237400 3.62157 0.0595488 2.09031
## 6 -0.549751 2.195180 1.294220 1.765400 3.40167 -0.9153670 1.49592
## PC258 PC259 PC260 PC261 PC262 PC263 PC264
## 1 -4.979270 5.109740 -2.331330 6.834190 -0.274926 7.421500 6.11025
## 2 6.557060 -1.608890 -3.074850 -6.148390 6.760350 -1.000370 -8.14170
## 3 -0.813034 4.006600 -3.652450 2.137300 1.458280 0.192966 2.68077
## 4 0.886108 0.869854 7.937430 -3.747950 -0.502031 -1.799090 3.25296
## 5 11.112000 1.653780 3.413520 3.055280 -1.708960 7.025120 -3.74390
## 6 -1.657230 0.552206 -0.899609 -0.858042 1.089420 -1.827910 -2.43423
## PC265 PC266 PC267 PC268 PC269 PC270 PC271
## 1 5.32256 7.939550 6.495200 1.953540 -4.655260 -0.977609 12.078300
## 2 5.99707 -1.166110 -2.867990 -2.267450 8.229870 -1.413310 -5.985430
## 3 -2.00810 -4.942130 -3.400940 -3.032430 -6.644150 4.861830 2.857410
## 4 -14.71410 -8.144690 -0.413657 0.690066 -7.271240 6.026970 -5.664640
## 5 -3.29267 0.626529 -1.002230 -1.488080 1.410530 -1.781240 -4.620030
## 6 2.24657 -0.119322 -2.362640 -0.468351 0.610115 1.409400 0.323562
## PC272 PC273 PC274 PC275 PC276 PC277 PC278
## 1 -0.910335 -1.806210 1.873230 -2.538080 -2.43342 4.741830 -1.2961000
## 2 -3.000510 2.047340 -10.367600 6.577390 7.94295 -5.500160 -3.7286400
## 3 1.037970 0.355473 -7.162250 -0.421809 -5.61027 -4.704570 -0.0877855
## 4 -5.868880 -1.507780 -0.342013 -1.749640 -6.94028 8.331200 4.7158500
## 5 -1.809300 -0.243658 -1.637540 5.529760 10.23170 -0.837563 -4.7368600
## 6 -1.919400 3.344320 -2.278050 -2.165890 1.54627 -0.884273 4.3105300
## PC279 PC280 PC281 PC282 PC283 PC284 PC285
## 1 -5.82474 -1.419010 8.29881 -8.116370 -7.848090 0.2436540 3.836820
## 2 5.77667 -6.352180 -11.21280 0.798228 6.493610 3.1399400 -3.076020
## 3 3.26714 -0.534761 -1.82046 5.921090 1.933640 -3.0718700 2.021510
## 4 -2.60820 2.541770 2.67376 8.067010 0.296274 -0.0677794 -3.007980
## 5 2.21890 -2.662070 5.24034 -1.548830 -1.394500 2.0120700 0.460678
## 6 1.85458 -0.801299 -2.44189 -1.714900 -1.442040 -4.6164500 -0.533363
## PC286 PC287 PC288 PC289 PC290 PC291 PC292
## 1 -1.107150 5.02363 4.868420 -1.551890 -4.384130 -0.924084 4.88828
## 2 1.040610 -4.26538 -6.666820 -8.361290 4.005870 2.312520 7.81958
## 3 -4.947770 -5.85838 6.707720 0.357686 2.134500 -4.326060 -10.03790
## 4 -3.184640 -7.53130 -0.604055 1.934400 -7.573420 -4.788370 -5.43548
## 5 4.789110 -2.99396 -8.025670 8.783390 0.391218 3.014200 2.75240
## 6 -0.166169 -0.37667 0.582729 -0.887963 1.730280 0.509809 -1.72821
## PC293 PC294 PC295 PC296 PC297 PC298 PC299
## 1 0.536126 0.0550503 3.41441 -2.708390 0.7208130 -1.3235200 -3.59956
## 2 -0.243326 5.5835000 -8.63898 -4.624690 -6.9552800 6.4001300 5.51019
## 3 -9.163130 -2.6650300 6.36658 -1.185990 -0.0281238 -4.1535700 -4.62046
## 4 -2.181070 -2.9627300 1.99134 5.160160 2.1360400 0.4541610 5.05063
## 5 2.213260 2.1366100 -4.12150 -0.853176 1.1305500 -2.4886100 -2.40447
## 6 -0.563275 -0.9066620 1.15948 0.513928 -0.0372562 -0.0976939 0.32065
## PC300 PC301 PC302 PC303 PC304 PC305 PC306
## 1 0.2186940 -1.453630 -2.992980 -2.909680 4.2545900 -0.645737 -3.76026
## 2 1.8141200 2.024630 5.214790 2.797050 0.0608232 -6.055850 -4.09893
## 3 1.1563300 -0.319754 -3.398020 0.321438 -3.5860100 11.283600 10.07550
## 4 1.8365800 4.552550 3.770630 5.100650 0.6386690 -4.081290 -2.52254
## 5 -0.0809592 4.322960 -2.153850 3.069390 -1.2942200 5.258960 5.97411
## 6 -1.0629100 1.128340 0.468416 -1.561140 -0.5808200 -0.761010 -2.21337
## PC307 PC308 PC309 PC310 PC311 PC312 PC313
## 1 0.244887 6.71885 -2.3701300 1.976050 -4.925400 -7.904390 -1.164800
## 2 -1.266530 3.93018 -2.6318200 -0.645999 4.522350 1.874400 0.223717
## 3 1.797470 -11.44510 1.0609300 -6.173900 0.577299 8.867050 0.957311
## 4 2.174390 6.12426 6.9768500 0.866727 4.245680 2.225950 -5.645720
## 5 -0.770026 -4.78545 -1.6633700 -0.447816 2.733510 2.797620 1.978680
## 6 1.244560 -1.31937 -0.0931785 -1.112110 0.890418 0.627558 -1.684060
## PC314 PC315 PC316 PC317 PC318 PC319 PC320
## 1 -4.7039000 -3.551550 1.452120 -0.321486 1.44137 -1.701820 -2.659050
## 2 8.7279100 -0.923103 -4.274110 0.958430 1.34064 1.583010 1.275160
## 3 3.1664800 5.226240 9.531480 -4.667270 -2.66550 -5.783940 1.093020
## 4 0.5179370 7.155980 -0.310229 -1.736900 1.96646 -3.205700 -3.507880
## 5 -0.0297341 -3.317740 -2.514360 -1.962830 1.75362 -1.249770 1.992600
## 6 -1.9012200 1.492600 -0.121162 -1.477160 3.00940 -0.135196 0.304545
## PC321 PC322 PC323 PC324 PC325 PC326 PC327
## 1 0.135243000 0.585072 -0.281409 -2.12574 0.0345906 -1.321390 2.04231000
## 2 -0.000559001 -0.706043 3.055160 5.93449 3.2299300 3.721230 1.31495000
## 3 -2.192420000 0.702404 -8.126650 -1.58074 -0.1503090 -4.050220 -7.70272000
## 4 -3.608300000 -2.340140 -0.585046 -3.85943 -5.4179500 2.812440 -0.95259600
## 5 5.060540000 0.450056 -3.984820 2.73854 2.3490100 -0.440894 -1.32015000
## 6 -0.608329000 -1.296710 0.717399 1.30489 0.1741800 -0.982965 -0.00594356
## PC328 PC329 PC330 PC331 PC332 PC333 PC334 PC335
## 1 3.39907 -1.385650 -0.262932 3.019960 -1.25052 -2.34081 0.624894 -1.0938100
## 2 -5.97622 -0.681410 -0.929918 -3.425650 4.69072 -2.24299 0.866910 3.7846600
## 3 5.68392 -0.805024 -3.936020 2.197850 -7.42237 7.74920 -1.672690 -0.3279200
## 4 -1.56369 0.907668 -1.020690 -2.848570 -3.97086 3.35699 -0.509832 0.0536633
## 5 5.04479 3.912290 1.425940 0.447452 -1.40157 3.85317 -1.844840 0.8988840
## 6 1.80299 -0.930084 1.339560 -0.889215 -0.18849 2.98046 1.139570 -1.1487800
## PC336 PC337 PC338 PC339 PC340 PC341 PC342
## 1 2.075680 -2.127130 -0.170296 -2.740140 0.469104 -1.44840000 1.195470
## 2 1.478030 -2.112780 -0.196086 -1.147750 1.660270 1.70107000 -0.967533
## 3 1.255840 6.075310 -4.115160 3.555280 -8.260140 0.00922786 -0.507900
## 4 -2.763500 5.402710 -1.923560 1.131050 -7.134350 0.52923700 -2.741820
## 5 0.560885 -2.465860 -2.090550 -0.340069 -2.630140 0.51994400 0.634572
## 6 -0.700027 -0.121861 -0.555715 -0.176553 -0.905249 -1.21157000 1.604980
## PC343 PC344 PC345 PC346 PC347 PC348 PC349
## 1 -2.323010 -1.0121100 -0.1830280 -0.290680 1.551550 -2.467190 1.122570
## 2 2.808910 0.7845210 2.6684300 0.582590 -1.406260 1.823350 0.867672
## 3 -1.214040 0.8489200 0.1990410 -1.370200 -1.319370 2.339740 0.183645
## 4 -3.267440 -0.0508354 -1.4653700 0.781824 0.681248 -0.736468 -0.589202
## 5 0.643813 3.4088000 1.8794800 3.022270 -0.635761 1.211430 0.143643
## 6 0.955930 0.9965580 -0.0890403 -0.252370 0.266181 -0.458287 -0.820746
## PC350 PC351 PC352 PC353 PC354 PC355 PC356
## 1 -0.709033 1.453890 -1.4312800 -0.690181 0.797093 -2.333530 -0.6077790
## 2 0.809768 1.705240 1.3342900 -1.118480 -0.812448 1.017340 0.0428385
## 3 1.983570 -2.262320 -1.1798500 -1.872390 -3.243960 -1.010090 1.3243600
## 4 -1.368700 -0.887185 0.6656390 -0.353669 -0.142433 1.019170 1.9231200
## 5 -1.331180 -1.364050 -2.2522900 -2.461810 -0.349798 -1.149460 1.7929800
## 6 1.669300 -0.419366 -0.0092544 0.274321 -0.994408 -0.315862 -1.1322900
## PC357 PC358 PC359 PC360 PC361 PC362 PC363
## 1 0.115936 0.519136 0.460040 -0.367271 -0.967409 -0.134231 -0.690589
## 2 0.793664 -1.217370 0.260045 2.048240 -0.134776 0.770359 -1.757710
## 3 -0.263245 -1.006710 -0.497006 -1.885360 0.551837 -0.683457 2.275790
## 4 -0.200201 0.553114 -2.763210 0.337401 -1.046460 0.274091 0.165550
## 5 1.031720 -0.679207 0.654605 0.944570 -1.074490 1.663520 -0.185349
## 6 0.243332 -1.612620 -0.540859 1.485640 0.311318 0.923339 -0.177838
## PC364 PC365 PC366 PC367 PC368 PC369 PC370
## 1 -1.3545300 -0.239697 -0.755382 -0.868189 -1.11042000 0.5904210 1.3528200
## 2 1.6543000 0.558418 0.202779 0.517719 1.62765000 -0.9450570 0.3727300
## 3 1.6294400 -2.030210 0.182973 0.731053 -0.26230700 0.8984660 -1.6264800
## 4 0.1314790 -1.079810 -0.177486 -0.626611 1.17041000 1.1867000 0.0602634
## 5 0.5421790 -0.713495 0.487598 0.463972 -1.67800000 0.0318053 -0.5494110
## 6 -0.0967296 -1.784820 -0.269995 1.577560 -0.00951893 1.5399400 0.7374750
## PC371 PC372 PC373 PC374 PC375 PC376 PC377
## 1 -0.141998 0.0332270 1.335640 -0.915889 -0.19584000 -0.823390 -0.422008
## 2 -0.736130 -0.0479299 -0.421334 0.447563 0.62618400 0.430809 -0.984879
## 3 0.447691 -0.6875390 1.160690 -0.955596 -0.23514100 0.636300 0.703103
## 4 0.327109 0.2110810 -0.385925 0.488643 0.00343022 1.018940 -0.990748
## 5 0.649439 0.9508460 1.090140 -0.627935 0.35695600 0.752857 0.487994
## 6 -1.326750 2.4866600 -0.441754 0.174908 -1.44478000 -1.077970 0.944822
## PC378 PC379 PC380 PC381 PC382 PC383 PC384
## 1 -0.510790 0.432559 0.683549 0.159685 0.639761 0.0358949 0.0986296
## 2 -0.382882 -0.844789 -0.983589 -0.592047 -0.138335 0.2256740 0.0792632
## 3 1.048070 0.457638 -1.547170 0.198258 -0.740662 -0.6217750 0.5673660
## 4 1.931920 1.296460 0.220242 1.085230 0.154463 0.0908297 0.3935700
## 5 -0.234562 -0.116450 0.481285 0.500762 -0.908509 -0.3187640 -0.0627536
## 6 -0.601723 -1.354920 1.851500 2.001680 1.289720 -3.2342200 1.3174400
## PC385 PC386 PC387 PC388 PC389 PC390 PC391
## 1 -0.898909000 1.044840 0.263129 0.61974000 -0.029580 1.0168800 0.0815222
## 2 -0.281327000 -0.313312 -0.113803 -0.02377540 0.403726 -0.8890920 -0.6996260
## 3 1.285490000 -1.089040 -1.106970 0.18384400 -0.552504 -0.0131367 0.1855960
## 4 0.125072000 -0.241637 0.588003 -0.25243700 0.226507 1.0753600 0.1860600
## 5 0.000334268 -1.341350 0.654062 0.00591793 0.311428 0.3174380 -0.4015010
## 6 0.295576000 0.994932 -0.592279 -7.51061000 0.169847 -0.0960730 7.1717300
## PC392 PC393 PC394 PC395 PC396 PC397 PC398
## 1 0.225875 0.600332 0.8254670 -0.6197690 0.2708750 -0.0465929 -0.0944115
## 2 -0.883643 -1.492620 -0.2098680 0.0210525 0.0627222 0.4162110 0.2761900
## 3 -0.360569 -0.400807 -0.6254560 -0.1698280 -0.7375410 0.0376595 0.3465290
## 4 0.260933 0.568228 0.0217204 -0.2061950 -0.1183380 -0.6470880 0.1663270
## 5 -0.599635 -0.868773 -0.2069020 0.4217340 -0.1576590 0.8492130 0.1079840
## 6 9.569070 22.947800 13.6179000 2.8663700 -0.6742250 3.1309700 -4.1101100
## PC399 PC400 PC401 PC402 PC403 PC404 PC405
## 1 0.413183 -0.19409600 -0.0766816 -0.1177990 0.0299216 0.0437928 0.2429050
## 2 -0.542779 -0.44524800 0.0787651 -0.2438420 -0.3233910 -0.4778800 -0.0218900
## 3 -0.761792 0.43454300 0.3732340 -0.3723580 -0.1476480 0.5908010 -0.0548901
## 4 -0.111911 -0.00628506 -0.4629060 0.1780760 -0.5087130 0.1659680 -0.1389240
## 5 0.413620 -0.38755400 -0.1315430 0.0859722 0.6270870 0.5043940 0.0269752
## 6 1.648010 -1.59798000 -6.9685600 6.3283200 -0.7779730 -0.1180150 -0.5959370
## PC406 PC407 PC408 PC409 PC410 Individual Pop_City
## 1 0.193157 -0.3313060 0.1640620 -0.1722780 6.70906e-07 801 Durres
## 2 -0.068859 -0.2049960 -0.3082120 -0.0175014 6.70906e-07 802 Durres
## 3 0.312284 0.2723950 0.3143880 0.2047130 6.70906e-07 803 Durres
## 4 -0.240833 -0.0143634 0.0239198 -0.0756231 6.70906e-07 804 Durres
## 5 -0.207433 -0.0744912 0.0200505 -0.0827959 6.70906e-07 805 Durres
## 6 -0.468115 0.4927270 0.2489820 0.0354084 6.70906e-07 806 Durres
## Country Latitude Longitude Continent Year Region Subregion order
## 1 Albania 41.29704 19.50373 Europe 2018 Southern Europe East Europe 33
## 2 Albania 41.29704 19.50373 Europe 2018 Southern Europe East Europe 33
## 3 Albania 41.29704 19.50373 Europe 2018 Southern Europe East Europe 33
## 4 Albania 41.29704 19.50373 Europe 2018 Southern Europe East Europe 33
## 5 Albania 41.29704 19.50373 Europe 2018 Southern Europe East Europe 33
## 6 Albania 41.29704 19.50373 Europe 2018 Southern Europe East Europe 33
## order2 orderold
## 1 25 25
## 2 25 25
## 3 25 25
## 4 25 25
## 5 25 25
## 6 25 25
We will do 5 repetitions
# set output dir
# main options
# K = number of ancestral populations
# entropy = TRUE computes the cross-entropy criterion, # CPU = 4 is the number of CPU used (hidden input) project = NULL
project = snmf(
genotype,
K = 1:25,
project = "new",
repetitions = 5,
percentage = 0.25,
iterations = 500,
CPU = 10,
entropy = TRUE
)
project = load.snmfProject("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmfProject")
Cross entropy
# Open a new pdf file
pdf(here("output","europe","lea","lea_cross_entropy_europe_r01b.pdf"), width = 6, height = 4)
# Create your plot
plot(project, col = "blue", pch = 19, cex = 1.2)
# Close the pdf file
dev.off()
## png
## 2
Summary of project
## $repetitions
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7 K = 8 K = 9
## with cross-entropy 5 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5 5
## K = 10 K = 11 K = 12 K = 13 K = 14 K = 15 K = 16 K = 17
## with cross-entropy 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5
## K = 18 K = 19 K = 20 K = 21 K = 22 K = 23 K = 24 K = 25
## with cross-entropy 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5
##
## $crossEntropy
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7
## min 0.9336329 0.9163062 0.9060912 0.9026955 0.8985523 0.8959722 0.8937952
## mean 0.9343860 0.9171151 0.9068816 0.9033719 0.8998511 0.8966561 0.8947156
## max 0.9348277 0.9175281 0.9073800 0.9037948 0.9010259 0.8970960 0.8951899
## K = 8 K = 9 K = 10 K = 11 K = 12 K = 13 K = 14
## min 0.8925808 0.8908072 0.8894142 0.8887549 0.8882141 0.8881417 0.8870224
## mean 0.8934125 0.8917541 0.8904665 0.8896534 0.8892261 0.8884132 0.8878175
## max 0.8937800 0.8923090 0.8917745 0.8901307 0.8899716 0.8890740 0.8885704
## K = 15 K = 16 K = 17 K = 18 K = 19 K = 20 K = 21
## min 0.8875950 0.8870776 0.8874030 0.8874876 0.8882167 0.8881539 0.8890307
## mean 0.8880554 0.8881522 0.8884417 0.8881928 0.8887412 0.8888486 0.8896989
## max 0.8885403 0.8886712 0.8894964 0.8888051 0.8892527 0.8895587 0.8903120
## K = 22 K = 23 K = 24 K = 25
## min 0.8897467 0.8897551 0.8899381 0.8910612
## mean 0.8902147 0.8904257 0.8909128 0.8922199
## max 0.8910897 0.8918061 0.8932039 0.8940502
# get the cross-entropy of all runs for K = 15
ce15 = cross.entropy(project, K = 15)
ce15 #run 5 is best for k=15
## K = 15
## run 1 0.8885403
## run 2 0.8884533
## run 3 0.8880555
## run 4 0.8876327
## run 5 0.8875950
k=14 appears to have the lowest Cross-entropy
# get the cross-entropy of all runs for K = 14
ce14 = cross.entropy(project, K = 14)
ce14 #run 5 is best for k=14
## K = 14
## run 1 0.8879488
## run 2 0.8885704
## run 3 0.8881335
## run 4 0.8874123
## run 5 0.8870224
color_palette_14 <-
c(
"chocolate4",
"blue",
"#B20CC9",
"#1E90FF",
"green",
"#B22222",
"#77DD77",
"purple",
"#F49AC2",
"green4",
"purple4",
"#008080",
"yellow2",
"#FF8C1A"
)
Mean admixture by country using ggplot
best = which.min(cross.entropy(project, K = 14)) #5
sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
sampling_loc$Abbreviation[sampling_loc$Abbreviation == 'SCH'] <- 'STS'
library(reshape2)
# Extract ancestry coefficients
Q_values <- as.data.frame(Q(project, K = 14, run = best))
# Create a named vector to map countries to regions
# Add individual IDs and pops ids
Q_values$ind <- inds
Q_values$pop <- pops
Q_values$pop [Q_values$pop == 'SCH'] <- 'STS'
# Melt the data frame for plotting
Q_melted <- melt(Q_values, id.vars = c("ind", "pop"))
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country
Q_ordered <- Q_joined |>
arrange(Region, Country) |>
mutate(Region_Country = factor(Region_Country, levels = unique(Region_Country)))
# Group by Country and calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(Region_Country, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <- data.frame(Region_Country = unique(Q_grouped$Region_Country))
# Add the order of each country to ensure correct placement of borders
borders$order <- 1:nrow(borders) + 0.5 # Shift borders to the right edge of the bars
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(Region_Country) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# source the plotting function
source(
here("analyses", "my_theme2.R"
)
)
# Generate all potential variable names
all_variables <- paste0("V", 1:14)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_14[1:length(all_variables)])
# Merge this data frame with Q_grouped_filtered to create the new color column
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create bar chart
ggplot(Q_grouped_filtered, aes(x = Region_Country, y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_segment(data = borders, aes(x = order, xend = order, y = 0, yend = 1, fill = NULL), linetype = "solid", color = "#2C444A") + # Add borders
my_theme() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none") + # Hide legend
xlab("") + # Suppress x-axis label
ylab("Ancestry proportions") +
ggtitle("Ancestry matrix") +
labs(caption = "Each bar represents the average ancestry proportions for individuals in a given country for k=14.") +
# scale_fill_manual(values = color) +
scale_x_discrete(labels = function(x) gsub(".*_", "", x)) + # Remove Region prefix from labels
scale_fill_manual(values = color_palette_14)
ggsave(
here("output", "europe", "lea", "MAF_1", "LEA_admixture_by_country_europe_k14_r01_MAF1.pdf"
),
width = 10,
height = 7,
units = "in"
)
## [1] 5
change to correct matrix
leak14 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K14/run5/r2_0.01_b_r5.14.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak14)
## # A tibble: 6 × 14
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.000100 0.00295 0.334 3.91e-1 1.00e-4 3.35e-3 1.00e-4 1.00e-4 5.59e-3
## 2 0.0000999 0.0000999 0.671 7.74e-2 4.12e-3 8.39e-3 9.99e-5 9.99e-5 9.99e-5
## 3 0.0000999 0.0000999 0.0000999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 0.00771 0.00836 0.472 3.72e-1 1.00e-4 1.00e-4 1.16e-2 2.74e-2 1.40e-2
## 5 0.0200 0.0628 0.445 4.08e-1 9.99e-5 2.76e-3 1.12e-2 9.99e-5 9.99e-5
## 6 0.00749 0.0207 0.289 2.90e-1 9.99e-5 9.99e-5 9.99e-5 9.99e-5 1.91e-2
## # ℹ 5 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 9.99640e-05 2.94875e-03 3.33670e-01 3.90726e-01 9.99640e-05
## 2 1066 SOC 9.99280e-05 9.99280e-05 6.70722e-01 7.74045e-02 4.12325e-03
## 3 1067 SOC 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05
## 4 1068 SOC 7.70742e-03 8.36124e-03 4.72139e-01 3.71933e-01 9.99550e-05
## 5 1069 SOC 1.99983e-02 6.28003e-02 4.45023e-01 4.07885e-01 9.99369e-05
## 6 1070 SOC 7.49068e-03 2.06640e-02 2.88870e-01 2.90137e-01 9.99460e-05
## X6 X7 X8 X9 X10 X11
## 1 3.34744e-03 9.99640e-05 9.99640e-05 5.59063e-03 4.51920e-03 1.08335e-01
## 2 8.39170e-03 9.99280e-05 9.99280e-05 9.99280e-05 9.99280e-05 9.99280e-05
## 3 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05
## 4 9.99550e-05 1.15968e-02 2.73743e-02 1.40145e-02 9.99550e-05 9.99550e-05
## 5 2.75847e-03 1.11527e-02 9.99369e-05 9.99369e-05 9.99369e-05 4.96825e-02
## 6 9.99460e-05 9.99460e-05 9.99460e-05 1.91483e-02 9.99460e-05 1.12750e-01
## X12 X13 X14
## 1 1.15669e-01 8.83548e-03 2.59589e-02
## 2 1.87201e-01 9.99280e-05 5.13573e-02
## 3 9.98702e-01 9.98829e-05 9.98829e-05
## 4 9.99550e-05 3.37992e-02 5.25750e-02
## 5 9.99369e-05 9.99369e-05 9.99369e-05
## 6 2.18301e-01 4.20388e-02 9.99460e-05
Rename the columns
# Rename the columns starting from the third one
leak14 <- leak14 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
leak14$pop[leak14$pop == 'SCH'] <- 'STS'
# View the first few rows
head(leak14)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 9.99640e-05 2.94875e-03 3.33670e-01 3.90726e-01 9.99640e-05
## 2 1066 SOC 9.99280e-05 9.99280e-05 6.70722e-01 7.74045e-02 4.12325e-03
## 3 1067 SOC 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05
## 4 1068 SOC 7.70742e-03 8.36124e-03 4.72139e-01 3.71933e-01 9.99550e-05
## 5 1069 SOC 1.99983e-02 6.28003e-02 4.45023e-01 4.07885e-01 9.99369e-05
## 6 1070 SOC 7.49068e-03 2.06640e-02 2.88870e-01 2.90137e-01 9.99460e-05
## v6 v7 v8 v9 v10 v11
## 1 3.34744e-03 9.99640e-05 9.99640e-05 5.59063e-03 4.51920e-03 1.08335e-01
## 2 8.39170e-03 9.99280e-05 9.99280e-05 9.99280e-05 9.99280e-05 9.99280e-05
## 3 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05 9.98829e-05
## 4 9.99550e-05 1.15968e-02 2.73743e-02 1.40145e-02 9.99550e-05 9.99550e-05
## 5 2.75847e-03 1.11527e-02 9.99369e-05 9.99369e-05 9.99369e-05 4.96825e-02
## 6 9.99460e-05 9.99460e-05 9.99460e-05 1.91483e-02 9.99460e-05 1.12750e-01
## v12 v13 v14
## 1 1.15669e-01 8.83548e-03 2.59589e-02
## 2 1.87201e-01 9.99280e-05 5.13573e-02
## 3 9.98702e-01 9.98829e-05 9.98829e-05
## 4 9.99550e-05 3.37992e-02 5.25750e-02
## 5 9.99369e-05 9.99369e-05 9.99369e-05
## 6 2.18301e-01 4.20388e-02 9.99460e-05
Import Sample Locations
## Pop_City Country Latitude Longitude Continent Abbreviation Year
## 1 Berlin, NJ USA 39.79081 -74.9291 Americas BER 2018
## 2 Columbus, OH USA 39.97170 -82.9071 Americas COL 2015
## 3 Palm Beach USA 26.70560 -80.0364 Americas PAL 2018
## 4 Houston, TX USA 29.75491 -95.3505 Americas HOU 2018
## 5 Los Angeles USA 34.05220 -118.2437 Americas LOS 2018
## 6 Manaus, AM Brazil -3.09161 -60.0325 Americas MAU 2017
## Region Subregion order order2 orderold
## 1 North America 1 NA 75
## 2 North America 2 NA 76
## 3 North America 3 NA 77
## 4 North America 4 NA 78
## 5 North America 5 NA 79
## 6 South America 6 NA 80
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak14 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette_14 <-
c(
"green",
"#F49AC2",
"yellow",
"#FF8C1A",
"purple",
"green4",
"blue",
"#B20CD9",
"purple4",
"#77DD77",
"#008080",
"#1E90FF",
"#B22222",
"chocolate4")
# Generate all potential variable names
all_variables <- paste0("v", 1:14)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_14[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=14.\n LEA inference for 20,968 SNPs (MAF 1%).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette_14) +
expand_limits(y = c(0, 1.5))
## [1] 5
change to correct matrix
leak10 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K10/run5/r2_0.01_b_r5.10.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak10)
## # A tibble: 6 × 10
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.64e-4 1.00e-4 0.254 1.00e-4 1.06e-1 1.95e-2 5.10e-1 1.00e-4 1.00e-4 1.10e-1
## 2 1.94e-3 1.00e-4 0.402 1.12e-2 8.40e-2 1.00e-4 4.70e-1 1.00e-4 1.43e-2 1.62e-2
## 3 9.99e-5 9.99e-5 0.999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 1.00e-4 1.15e-2 0.159 3.49e-2 8.42e-2 4.02e-2 6.06e-1 1.00e-4 5.36e-3 5.92e-2
## 5 1.88e-3 1.94e-3 0.117 1.00e-4 1.06e-1 1.00e-4 6.56e-1 1.60e-2 1.92e-2 8.18e-2
## 6 4.93e-2 1.30e-2 0.284 1.00e-4 1.00e-4 3.85e-2 4.67e-1 1.00e-4 1.00e-4 1.48e-1
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak10 <- leak10 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
leak10$pop[leak10$pop == 'SCH'] <- 'STS'
Import Sample Locations
color_palette_10 <-
c(
"#FF8C1A",
"purple",
"green",
"#F49AC2",
"#B22222",
"blue",
"#1E90FF",
"yellow2",
"#77DD77",
"purple4"
)
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak10 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette_10 <-
c(
"purple",
"blue",
"#F49AC2",
"green",
"#B22222",
"#77DD77",
"purple4",
"#FF8C1A",
"yellow2",
"#1E90FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:10)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_10[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=10.\n LEA inference for 20,968 SNPs (MAF 1%).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette_10) +
expand_limits(y = c(0, 1.5))
## [1] 5
change to correct matrix
leak11 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K11/run5/r2_0.01_b_r5.11.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak11)
## # A tibble: 6 × 11
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9.46e-2 0.221 4.55e-2 8.79e-4 1.47e-2 5.64e-1 3.30e-3 4.48e-3 2.43e-2 2.72e-2
## 2 9.99e-5 0.350 8.71e-2 3.28e-3 9.99e-5 5.53e-1 9.99e-5 9.99e-5 9.99e-5 5.71e-3
## 3 9.99e-5 0.999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 2.08e-2 0.142 4.54e-2 7.08e-3 1.36e-2 6.23e-1 1.40e-2 2.69e-2 6.31e-2 3.64e-2
## 5 8.18e-2 0.0688 2.63e-2 3.60e-2 1.41e-2 6.86e-1 1.00e-4 2.61e-2 4.30e-2 1.00e-4
## 6 1.16e-1 0.293 1.00e-4 1.00e-4 4.15e-2 4.47e-1 1.00e-4 6.16e-2 1.00e-4 4.09e-2
## # ℹ 1 more variable: X11 <dbl>
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak11 <- leak11 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
#replace SCH with STS
leak11$pop[leak11$pop == 'SCH'] <- 'STS'
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak11 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette_11 <-
c(
"#FF8C1A",
"#77DD77",
"#F49AC2",
"green",
"#1E90FF",
"purple",
"blue",
"yellow2",
"purple4",
"#FFB347",
"#B22222"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:11)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_11[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=11.\n LEA inference for 20,968 SNPs (MAF 1%).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette_11) +
expand_limits(y = c(0, 1.5))
## [1] 5
change to correct matrix
leak12 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K12/run5/r2_0.01_b_r5.12.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak12)
## # A tibble: 6 × 12
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.257 1.00e-4 1.00e-4 8.21e-3 4.52e-3 2.69e-2 1.83e-2 1.00e-4 5.02e-1 1.20e-1
## 2 0.364 1.00e-4 1.99e-2 1.00e-4 3.84e-2 1.00e-4 1.20e-2 1.00e-4 5.11e-1 1.00e-4
## 3 0.999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 0.148 4.23e-2 7.75e-2 9.32e-3 4.58e-2 1.00e-4 2.64e-3 1.00e-4 5.86e-1 1.31e-2
## 5 0.103 1.00e-4 1.00e-4 1.99e-2 4.29e-2 1.00e-4 1.68e-2 3.98e-2 6.80e-1 8.03e-2
## 6 0.256 6.00e-2 1.00e-4 1.00e-4 1.13e-2 1.00e-4 1.00e-4 3.22e-2 4.98e-1 1.20e-1
## # ℹ 2 more variables: X11 <dbl>, X12 <dbl>
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak12 <- leak12 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
leak12$pop[leak12$pop == 'SCH'] <- 'STS'
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak12 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette_12 <-
c(
"purple",
"#B22222",
"#1E90FF",
"#FF8C1A",
"#F49AC2",
"green4",
"chocolate4",
"#77DD77",
"purple4",
"blue",
"yellow2",
"green"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:12)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_12[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=12.\n LEA inference for 20,968 SNPs (MAF 1%).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette_12) +
expand_limits(y = c(0, 1.5))
## [1] 5
change to correct matrix
leak13 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K13/run5/r2_0.01_b_r5.13.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak13)
## # A tibble: 6 × 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.00e-4 2.40e-2 0.209 1.28e-3 4.00e-2 1.00e-4 1.00e-4 8.70e-2 7.83e-4 7.62e-3
## 2 1.00e-4 5.03e-2 0.336 1.00e-4 1.00e-4 1.56e-2 4.20e-3 1.00e-4 9.43e-4 4.31e-3
## 3 9.99e-5 9.99e-5 0.999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 7.12e-3 5.18e-3 0.138 2.42e-3 5.99e-2 7.41e-2 3.83e-2 4.44e-2 1.00e-4 2.55e-3
## 5 3.18e-2 4.07e-2 0.0755 1.00e-4 1.00e-4 7.42e-4 6.50e-2 2.25e-3 5.02e-3 6.10e-4
## 6 1.00e-4 2.44e-2 0.287 1.00e-4 5.36e-2 1.00e-4 9.51e-3 5.46e-3 1.00e-4 3.89e-2
## # ℹ 3 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak13 <- leak13 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
leak13$pop[leak13$pop == 'SCH'] <- 'STS'
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak13 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette_13 <-
c(
"#FF8C1A",
"green",
"purple",
"#B22222",
"#75FAFF",
"#F49AC2",
"green4",
"#008080",
"purple4",
"#77DD77",
"#1E90FF",
"blue",
"yellow2"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:13)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_13[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=13.\n LEA inference for 20,968 SNPs (MAF 1%).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette_13) +
expand_limits(y = c(0, 1.5))
## [1] 5
change to correct matrix
leak15 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K15/run5/r2_0.01_b_r5.15.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak15)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.405 0.0987 0.0262 3.51e-1 9.99e-5 6.20e-2 9.99e-5 8.15e-3 9.99e-5
## 2 0.0771 0.190 0.0543 6.65e-1 9.99e-5 9.99e-5 9.99e-5 9.86e-3 3.42e-3
## 3 0.0000999 0.999 0.0000999 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5 9.99e-5
## 4 0.351 0.0000999 0.0345 4.67e-1 9.99e-5 2.45e-2 9.99e-5 2.12e-2 4.32e-2
## 5 0.400 0.0000999 0.0112 4.49e-1 7.34e-4 5.16e-2 6.36e-2 9.95e-3 9.99e-5
## 6 0.294 0.197 0.0000999 2.99e-1 3.41e-2 1.10e-1 9.99e-5 9.99e-5 1.18e-2
## # ℹ 6 more variables: X10 <dbl>, X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # X15 <dbl>
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak15 <- leak15 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
leak15$pop[leak15$pop == 'SCH'] <- 'STS'
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak15 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette_15 <-
c(
"#008080",
"#F49AC2",
"#B22222",
"chocolate4",
"#77DD77",
"green",
"#B20CC9",
"#FF8C1A",
"#1E90FF",
"#FFB347",
"purple",
"#75FAFF",
"blue",
"purple4",
"yellow2"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:15)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_15[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=15.\n LEA inference for 20,968 SNPs (MAF 1%).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette_15) +
expand_limits(y = c(0, 1.5))
## K = 5
## run 1 0.8997042
## run 2 0.9005286
## run 3 0.9010259
## run 4 0.8994445
## run 5 0.8985523
change to correct matrix
leak5 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K5/run5/r2_0.01_b_r5.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.182 0.0166 0.736 0.00556 0.0595
## 2 0.0875 0.0211 0.868 0.0230 0.000100
## 3 0.000100 0.000100 0.994 0.000100 0.00605
## 4 0.120 0.0293 0.726 0.0743 0.0504
## 5 0.178 0.0409 0.741 0.0106 0.0292
## 6 0.154 0.0490 0.706 0.000100 0.0902
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak5 <- leak5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
leak5$pop[leak5$pop == 'SCH'] <- 'STS'
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak5 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette_5 <-
c(
"#77DD37",
"purple3",
"#1E90FF",
"#FFFF19",
"#FF8C1A"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:5)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_5[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=5.\n LEA inference for 20,968 SNPs (MAF 1%).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette_5) +
expand_limits(y = c(0, 1.5))
## [1] 5
change to correct matrix
leak4 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K4/run5/r2_0.01_b_r5.4.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak4)
## # A tibble: 6 × 4
## X1 X2 X3 X4
## <dbl> <dbl> <dbl> <dbl>
## 1 0.105 0.0501 0.0756 0.770
## 2 0.0636 0.0367 0.000100 0.900
## 3 0.000100 0.000100 0.000100 1.00
## 4 0.112 0.0796 0.0654 0.743
## 5 0.146 0.0250 0.0632 0.766
## 6 0.123 0.000806 0.112 0.764
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak4 <- leak4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
leak4$pop[leak4$pop == 'SCH'] <- 'STS'
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette_4 <-
c(
"#FF8C1A",
"#1E90FF",
"#77DD37",
"purple3"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:4)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_4[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=4.\n LEA inference for 20,968 SNPs (MAF 1%).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette_4) +
expand_limits(y = c(0, 1.5))
## [1] 5
change to correct matrix
leak3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K3/run5/r2_0.01_b_r5.3.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak3)
## # A tibble: 6 × 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 0.111 0.775 0.114
## 2 0.0982 0.892 0.00993
## 3 0.000100 1.00 0.000100
## 4 0.167 0.748 0.0848
## 5 0.148 0.768 0.0831
## 6 0.0932 0.779 0.128
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
Add it to the matrix
Rename the columns
# Rename the columns starting from the third one
leak3 <- leak3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
leak3$pop[leak3$pop == 'SCH'] <- 'STS'
Import Sample Locations
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette_3 <-
c(
"purple3",
"#FF8C1A",
"#1E90FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:3)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette_3[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=3.\n LEA inference for 20,968 SNPs (MAF 1%).") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette_3) +
expand_limits(y = c(0, 1.5))
library(raster) # important to load before tidyverse, otherwise it masks select()
library(tidyverse)
library(scatterpie)
library(sf)
library(ggspatial)
library(ggplot2)
library(dplyr)
library(colorout)
library(here)
library(extrafont)
library(rnaturalearth)
library(rnaturalearthdata)
library(rnaturalearthhires)
library(ggrepel)
library(Cairo)
project = load.snmfProject("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmfProject")
Import samples attributes
#data<-read.csv("sampling_loc_all.csv", stringsAsFactors = TRUE)
#write_rds(data, "sampling_loc_all.rds")
sampling_loc <- readRDS(here("output", "sampling_loc_euro_global.rds"))
# head(sampling_loc)
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Subregion, Year, order
)
head(pops)
## Abbreviation Latitude Longitude Pop_City Country Region
## 1 FRS 45.16531 5.771806 Saint-Martin-d'Heres France Western Europe
## 2 STS 48.61124 7.754512 Strasbourg France Western Europe
## 3 POP 41.18555 -8.329371 Penafiel Portugal Southern Europe
## 4 POL 37.09084 -8.092465 Loule Portugal Southern Europe
## 5 SPB 38.86622 -6.974194 Badajoz Spain Southern Europe
## 6 SPS 36.17042 -5.371530 San Roque Spain Southern Europe
## Subregion Year order
## 1 West Europe 2019 9
## 2 West Europe 2019 10
## 3 West Europe 2017 11
## 4 West Europe 2017 12
## 5 West Europe 2018 13
## 6 West Europe 2017 14
Import the Q matrix (K15 for LEA)
Select a Q matrix from one of the runs for the best k
## [1] 5
# Extract ancestry coefficients
leak15 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K15/run5/r2_0.01_b_r5.15.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
leak15 <- leak15 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(leak15)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.405 0.0987 0.0262 0.351 0.0001 0.0620 0.0001 0.00815 0.0001 0.0133
## 2 0.0771 0.190 0.0543 0.665 0.0001 0.0001 0.0001 0.00986 0.00342 0.0001
## 3 0.0001 0.999 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
## 4 0.351 0.0001 0.0345 0.467 0.0001 0.0245 0.0001 0.0212 0.0432 0.0171
## 5 0.400 0.0001 0.0112 0.449 0.000734 0.0516 0.0636 0.00995 0.0001 0.0001
## 6 0.294 0.197 0.0001 0.299 0.0341 0.110 0.0001 0.0001 0.0118 0.0180
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
leak15 <- leak15 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
leak15$index <- seq_len(nrow(leak15))
# Perform the merge as before
df1 <-
merge(
leak15,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7
## 310 SOC 1065 0.404928 0.098722 0.026179 0.350744 0.000100 0.061956 0.000100
## 311 SOC 1066 0.077053 0.189667 0.054270 0.664839 0.000100 0.000100 0.000100
## 312 SOC 1067 0.000100 0.998602 0.000100 0.000100 0.000100 0.000100 0.000100
## 313 SOC 1068 0.351429 0.000100 0.034498 0.466733 0.000100 0.024473 0.000100
## 314 SOC 1069 0.400130 0.000100 0.011186 0.448608 0.000734 0.051617 0.063612
## 315 SOC 1070 0.293928 0.196814 0.000100 0.299322 0.034075 0.109669 0.000100
## v8 v9 v10 v11 v12 v13 v14 v15
## 310 0.008147 0.000100 0.013342 0.035282 0.000100 0.000100 0.000100 0.000100
## 311 0.009857 0.003415 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100
## 312 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100
## 313 0.021249 0.043225 0.017112 0.000100 0.028056 0.000100 0.012626 0.000100
## 314 0.009947 0.000100 0.000100 0.000100 0.000100 0.005109 0.000100 0.008458
## 315 0.000100 0.011779 0.017993 0.015304 0.000100 0.000100 0.020518 0.000100
## Latitude Longitude Pop_City Country Region Subregion Year order
## 310 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 311 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 312 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 313 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 314 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 315 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
Q-values for k=15
make a palette with 15 colors
colors2 <-c(
"v1" = "#B20CD9",
"v2" = "purple4",
"v3" = "#F49AC2",
"v4" = "purple",
"v5" = "#B22222",
"v6" = "yellow2",
"v7" = "#FF8C1A",
"v8" = "chocolate4",
"v9" = "#FFB347",
"v10" = "#008080",
"v11" = "#75FAFF",
"v12" = "green",
"v13" = "#1E90FF",
"v14" = "#77DD77",
"v15" = "blue"
)
colors2
## v1 v2 v3 v4 v5 v6
## "#B20CD9" "purple4" "#F49AC2" "purple" "#B22222" "yellow2"
## v7 v8 v9 v10 v11 v12
## "#FF8C1A" "chocolate4" "#FFB347" "#008080" "#75FAFF" "green"
## v13 v14 v15
## "#1E90FF" "#77DD77" "blue"
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_MAF1_r01_k15_pie_all_countries.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K5 for LEA) Select a Q matrix from one of the runs for the best k
## [1] 5
# Extract ancestry coefficients
leak5 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K5/run5/r2_0.01_b_r5.5.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
leak5 <- leak5 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(leak5)
## # A tibble: 6 × 5
## X1 X2 X3 X4 X5
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.182 0.0166 0.736 0.00556 0.0595
## 2 0.0875 0.0211 0.868 0.0230 0.0001
## 3 0.0001 0.0001 0.994 0.0001 0.00604
## 4 0.120 0.0293 0.726 0.0743 0.0504
## 5 0.178 0.0409 0.741 0.0106 0.0292
## 6 0.154 0.0490 0.706 0.0001 0.0902
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
leak5 <- leak5 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
leak5$index <- seq_len(nrow(leak5))
# Perform the merge as before
df1 <-
merge(
leak5,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 Latitude Longitude
## 310 SOC 1065 0.182089 0.016569 0.736277 0.005563 0.059503 43.60042 39.74533
## 311 SOC 1066 0.087534 0.021107 0.868272 0.022987 0.000100 43.60042 39.74533
## 312 SOC 1067 0.000100 0.000100 0.993655 0.000100 0.006045 43.60042 39.74533
## 313 SOC 1068 0.120267 0.029297 0.725733 0.074326 0.050377 43.60042 39.74533
## 314 SOC 1069 0.177980 0.040930 0.741311 0.010575 0.029204 43.60042 39.74533
## 315 SOC 1070 0.154278 0.049038 0.706338 0.000100 0.090246 43.60042 39.74533
## Pop_City Country Region Subregion Year order
## 310 Sochi Russia Eastern Europe East Europe 2021 46
## 311 Sochi Russia Eastern Europe East Europe 2021 46
## 312 Sochi Russia Eastern Europe East Europe 2021 46
## 313 Sochi Russia Eastern Europe East Europe 2021 46
## 314 Sochi Russia Eastern Europe East Europe 2021 46
## 315 Sochi Russia Eastern Europe East Europe 2021 46
Q-values for k=5
make a palette with 5 colors
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_MAF1_r01_k5_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5"), color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Import the Q matrix (K15 for LEA) Select a Q matrix from one of the runs for the best k
## [1] 5
# Extract ancestry coefficients
leak14 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K14/run5/r2_0.01_b_r5.14.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
leak14 <- leak14 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(leak14)
## # A tibble: 6 × 14
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0001 0.00295 0.334 0.391 0.0001 0.00335 0.0001 0.0001 0.00559 0.00452
## 2 0.0001 0.0001 0.671 0.0774 0.00412 0.00839 0.0001 0.0001 0.0001 0.0001
## 3 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
## 4 0.00771 0.00836 0.472 0.372 0.0001 0.0001 0.0116 0.0274 0.0140 0.0001
## 5 0.0200 0.0628 0.445 0.408 0.0001 0.00276 0.0112 0.0001 0.0001 0.0001
## 6 0.00749 0.0207 0.289 0.290 0.0001 0.0001 0.0001 0.0001 0.0191 0.0001
## # ℹ 4 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
leak14 <- leak14 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
leak14$index <- seq_len(nrow(leak14))
# Perform the merge as before
df1 <-
merge(
leak14,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7
## 310 SOC 1065 0.000100 0.002949 0.333670 0.390726 0.000100 0.003347 0.000100
## 311 SOC 1066 0.000100 0.000100 0.670722 0.077404 0.004123 0.008392 0.000100
## 312 SOC 1067 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100
## 313 SOC 1068 0.007707 0.008361 0.472139 0.371933 0.000100 0.000100 0.011597
## 314 SOC 1069 0.019998 0.062800 0.445023 0.407885 0.000100 0.002758 0.011153
## 315 SOC 1070 0.007491 0.020664 0.288870 0.290137 0.000100 0.000100 0.000100
## v8 v9 v10 v11 v12 v13 v14 Latitude
## 310 0.000100 0.005591 0.004519 0.108335 0.115669 0.008835 0.025959 43.60042
## 311 0.000100 0.000100 0.000100 0.000100 0.187201 0.000100 0.051357 43.60042
## 312 0.000100 0.000100 0.000100 0.000100 0.998702 0.000100 0.000100 43.60042
## 313 0.027374 0.014014 0.000100 0.000100 0.000100 0.033799 0.052575 43.60042
## 314 0.000100 0.000100 0.000100 0.049682 0.000100 0.000100 0.000100 43.60042
## 315 0.000100 0.019148 0.000100 0.112750 0.218301 0.042039 0.000100 43.60042
## Longitude Pop_City Country Region Subregion Year order
## 310 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 311 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 312 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 313 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 314 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 315 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
Q-values for k=14
make a palette with 14 colors
colors2 <-c(
"v1" = "chocolate4",
"v2" = "#FF8C1A",
"v3" = "#B20CD9",
"v4" = "purple4",
"v5" = "#008080",
"v6" = "blue",
"v7" = "#1E90FF",
"v8" = "green4",
"v9" = "#B22222",
"v10" = "green",
"v11" = "yellow2",
"v12" = "purple",
"v13" = "#77DD77",
"v14" = "#F49AC2"
)
colors2
## v1 v2 v3 v4 v5 v6
## "chocolate4" "#FF8C1A" "#B20CD9" "purple4" "#008080" "blue"
## v7 v8 v9 v10 v11 v12
## "#1E90FF" "green4" "#B22222" "green" "yellow2" "purple"
## v13 v14
## "#77DD77" "#F49AC2"
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_MAF1_r01_k14_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
Make pie plot
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"), color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# Extract ancestry coefficients
leak14 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K14/run5/r2_0.01_b_r5.14.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
leak14 <- leak14 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(leak14)
## # A tibble: 6 × 14
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0001 0.00295 0.334 0.391 0.0001 0.00335 0.0001 0.0001 0.00559 0.00452
## 2 0.0001 0.0001 0.671 0.0774 0.00412 0.00839 0.0001 0.0001 0.0001 0.0001
## 3 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
## 4 0.00771 0.00836 0.472 0.372 0.0001 0.0001 0.0116 0.0274 0.0140 0.0001
## 5 0.0200 0.0628 0.445 0.408 0.0001 0.00276 0.0112 0.0001 0.0001 0.0001
## 6 0.00749 0.0207 0.289 0.290 0.0001 0.0001 0.0001 0.0001 0.0191 0.0001
## # ℹ 4 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
leak14 <- leak14 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
leak14$index <- seq_len(nrow(leak14))
# Perform the merge as before
df1 <-
merge(
leak14,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7
## 310 SOC 1065 0.000100 0.002949 0.333670 0.390726 0.000100 0.003347 0.000100
## 311 SOC 1066 0.000100 0.000100 0.670722 0.077404 0.004123 0.008392 0.000100
## 312 SOC 1067 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100
## 313 SOC 1068 0.007707 0.008361 0.472139 0.371933 0.000100 0.000100 0.011597
## 314 SOC 1069 0.019998 0.062800 0.445023 0.407885 0.000100 0.002758 0.011153
## 315 SOC 1070 0.007491 0.020664 0.288870 0.290137 0.000100 0.000100 0.000100
## v8 v9 v10 v11 v12 v13 v14 Latitude
## 310 0.000100 0.005591 0.004519 0.108335 0.115669 0.008835 0.025959 43.60042
## 311 0.000100 0.000100 0.000100 0.000100 0.187201 0.000100 0.051357 43.60042
## 312 0.000100 0.000100 0.000100 0.000100 0.998702 0.000100 0.000100 43.60042
## 313 0.027374 0.014014 0.000100 0.000100 0.000100 0.033799 0.052575 43.60042
## 314 0.000100 0.000100 0.000100 0.049682 0.000100 0.000100 0.000100 43.60042
## 315 0.000100 0.019148 0.000100 0.112750 0.218301 0.042039 0.000100 43.60042
## Longitude Pop_City Country Region Subregion Year order
## 310 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 311 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 312 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 313 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 314 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 315 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
Q-values for k=14
make a palette with 14 colors
colors2 <-c(
"v1" = "chocolate4",
"v2" = "#FF8C1A",
"v3" = "#B20CD9",
"v4" = "purple4",
"v5" = "#008080",
"v6" = "blue",
"v7" = "#1E90FF",
"v8" = "green4",
"v9" = "#B22222",
"v10" = "green",
"v11" = "yellow2",
"v12" = "purple",
"v13" = "#77DD77",
"v14" = "#F49AC2"
)
colors2
## v1 v2 v3 v4 v5 v6
## "chocolate4" "#FF8C1A" "#B20CD9" "purple4" "#008080" "blue"
## v7 v8 v9 v10 v11 v12
## "#1E90FF" "green4" "#B22222" "green" "yellow2" "purple"
## v13 v14
## "#77DD77" "#F49AC2"
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_r01_MAF1_k14_pie_italy.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_r01_MAF1_k14_pie_fareast.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_r01_MAF1_k14_pie_iberia.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_r01_MAF1_k14_pie_balkans_labels.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
# Extract ancestry coefficients
leak15 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.snmf/K15/run5/r2_0.01_b_r5.15.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
leak15 <- leak15 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(leak15)
## # A tibble: 6 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.405 0.0987 0.0262 0.351 0.0001 0.0620 0.0001 0.00815 0.0001 0.0133
## 2 0.0771 0.190 0.0543 0.665 0.0001 0.0001 0.0001 0.00986 0.00342 0.0001
## 3 0.0001 0.999 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
## 4 0.351 0.0001 0.0345 0.467 0.0001 0.0245 0.0001 0.0212 0.0432 0.0171
## 5 0.400 0.0001 0.0112 0.449 0.000734 0.0516 0.0636 0.00995 0.0001 0.0001
## 6 0.294 0.197 0.0001 0.299 0.0341 0.110 0.0001 0.0001 0.0118 0.0180
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>, X15 <dbl>
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
leak15 <- leak15 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
leak15$index <- seq_len(nrow(leak15))
# Perform the merge as before
df1 <-
merge(
leak15,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7
## 310 SOC 1065 0.404928 0.098722 0.026179 0.350744 0.000100 0.061956 0.000100
## 311 SOC 1066 0.077053 0.189667 0.054270 0.664839 0.000100 0.000100 0.000100
## 312 SOC 1067 0.000100 0.998602 0.000100 0.000100 0.000100 0.000100 0.000100
## 313 SOC 1068 0.351429 0.000100 0.034498 0.466733 0.000100 0.024473 0.000100
## 314 SOC 1069 0.400130 0.000100 0.011186 0.448608 0.000734 0.051617 0.063612
## 315 SOC 1070 0.293928 0.196814 0.000100 0.299322 0.034075 0.109669 0.000100
## v8 v9 v10 v11 v12 v13 v14 v15
## 310 0.008147 0.000100 0.013342 0.035282 0.000100 0.000100 0.000100 0.000100
## 311 0.009857 0.003415 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100
## 312 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100 0.000100
## 313 0.021249 0.043225 0.017112 0.000100 0.028056 0.000100 0.012626 0.000100
## 314 0.009947 0.000100 0.000100 0.000100 0.000100 0.005109 0.000100 0.008458
## 315 0.000100 0.011779 0.017993 0.015304 0.000100 0.000100 0.020518 0.000100
## Latitude Longitude Pop_City Country Region Subregion Year order
## 310 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 311 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 312 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 313 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 314 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
## 315 43.60042 39.74533 Sochi Russia Eastern Europe East Europe 2021 46
Q-values for k=15
make a palette with 15 colors
colors2 <-c(
"v1" = "#B20CD9",
"v2" = "purple4",
"v3" = "#F49AC2",
"v4" = "purple",
"v5" = "#B22222",
"v6" = "yellow2",
"v7" = "#FF8C1A",
"v8" = "chocolate4",
"v9" = "#FFB347",
"v10" = "#008080",
"v11" = "#75FAFF",
"v12" = "green",
"v13" = "#1E90FF",
"v14" = "#77DD77",
"v15" = "blue"
)
colors2
## v1 v2 v3 v4 v5 v6
## "#B20CD9" "purple4" "#F49AC2" "purple" "#B22222" "yellow2"
## v7 v8 v9 v10 v11 v12
## "#FF8C1A" "chocolate4" "#FFB347" "#008080" "#75FAFF" "green"
## v13 v14 v15
## "#1E90FF" "#77DD77" "blue"
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_r01_MAF1_k15_pie_italy.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "large", returnclass = "sf")
countries_with_data <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries2 <- world |>
filter(admin=="Italy")
# Calculate mean proportions for each population
df_mean <- df1 |> filter(Country=="Italy") |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries2, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(6, 20), ylim = c(36, 47)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_r01_MAF1_k15_pie_fareast.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
countries_to_use<- c("Russia", "Ukraine", "Georgia", "Armenia")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(27, 48), ylim = c(38, 48)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_r01_MAF1_k15_pie_iberia.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
#countries_to_use<-subset(df1,Country=="Russia"|"Ukraine"|"Georgia"|"Armenia")
#countries_to_use<- df1[df1$Country %in% c("Russia", "Ukraine", "Georgia", "Armenia"), ]
countries_to_use<- c("Portugal", "Spain", "France")
countries_with_data2 <- unique(df1$Country)
# Filtering the world data to include only the countries in your data
selected_countries3 <- world |>
filter(admin %in% countries_with_data2)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 6), ylim = c(34, 45)) +
my_theme()
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/MAF_1/LEA_r01_MAF1_k15_pie_balkans_labels.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
# Filtering the world data to include only the countries in your data
world <- ne_countries(scale = "large", returnclass = "sf")
selected_countries3 <- world
# Filtering the world data to include only the countries in your data
#selected_countries3 <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries3, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 0.7),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"),
color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(14, 30), ylim = c(34, 47)) +
my_theme()
Set 3 SNPs using only the 24 overlapping pops Check data - we created 2 vcf files with LD pruning r2<0.01 (LD1) and r2<0.1 (LD2) after QC
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.01_b.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1.vcf
## /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/snps_sets/r2_0.1_b.vcf
Subset just the 24 pops we want (for SNP Set 3)
module load PLINK/1.9b_6.21-x86_64
cd /gpfs/gibbs/pi/caccone/mkc54/albo
plink \
--keep-allele-order \
--keep-fam /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/pops_4bfst_overlap2.txt \
--bfile /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/r2_0.01_b \
--make-bed \
--export vcf \
--out /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap \
--silent
grep "samples\|variants" /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.log
20968 variants loaded from .bim file. Total genotyping rate in remaining samples is 0.973943. 20968 variants and 242 people pass filters and QC.
Import the data
genotype <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.vcf"
)
d <- read.vcfR(
genotype
)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 20968
## column count: 251
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 20968
## Character matrix gt cols: 251
## skip: 0
## nrows: 20968
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant: 20968
## All variants processed
Get population and individuals information
inds_full <- attr(d@gt,"dimnames")[[2]]
inds_full <- inds_full[-1]
a <- strsplit(inds_full, '_')
pops <- unname(sapply(a, FUN = function(x) return(as.character(x[1]))))
table(pops)
## pops
## ALD BAR BUL CRO FRS GES GRA GRC ITB ITP ITR MAL POP ROS SER SLO SOC SPB SPC SPM
## 10 12 10 12 12 12 11 10 5 8 16 12 12 11 4 12 12 8 6 6
## SPS STS TUA TUH
## 8 12 9 12
Convert format
##
## - number of detected individuals: 242
## - number of detected loci: 20968
##
## For SNP info, please check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.vcfsnp.
##
## 0 line(s) were removed because these are not SNPs.
## Please, check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.removed file, for more informations.
## [1] "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.geno"
##
## - number of detected individuals: 242
## - number of detected loci: 20968
##
## For SNP info, please check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.vcfsnp.
##
## 0 line(s) were removed because these are not SNPs.
## Please, check /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.removed file, for more informations.
##
##
## - number of detected individuals: 242
## - number of detected loci: 20968
## [1] "/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.lfmm"
## [1] "******************************"
## [1] " Principal Component Analysis "
## [1] "******************************"
## summary of the options:
##
## -n (number of individuals) 242
## -L (number of loci) 20968
## -K (number of principal components) 242
## -x (genotype file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.lfmm
## -a (eigenvalue file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.pca/r2_0.01_b_overlap.eigenvalues
## -e (eigenvector file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.pca/r2_0.01_b_overlap.eigenvectors
## -d (standard deviation file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.pca/r2_0.01_b_overlap.sdev
## -p (projection file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.pca/r2_0.01_b_overlap.projections
## -c data centered
## * pca class *
##
## project directory: /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/
## pca result directory: r2_0.01_b_overlap.pca/
## input file: r2_0.01_b_overlap.lfmm
## eigenvalue file: r2_0.01_b_overlap.eigenvalues
## eigenvector file: r2_0.01_b_overlap.eigenvectors
## standard deviation file: r2_0.01_b_overlap.sdev
## projection file: r2_0.01_b_overlap.projections
## pcaProject file: r2_0.01_b_overlap.pcaProject
## number of individuals: 242
## number of loci: 20968
## number of principal components: 242
## centered: TRUE
## scaled: FALSE
Test
## [1] "*******************"
## [1] " Tracy-Widom tests "
## [1] "*******************"
## summary of the options:
##
## -n (number of eigenvalues) 242
## -i (input file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.pca/r2_0.01_b_overlap.eigenvalues
## -o (output file) /gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.pca/r2_0.01_b_overlap.tracywidom
# tw$pvalues
# plot the percentage of variance explained by each component
plot(tw$percentage, pch = 19, col = "blue", cex = .8)
Get values
# plot preparation
pc.coord <- as.data.frame(pc$projections)
colnames(pc.coord) <- paste0("PC", 1:nPC)
pc.coord$Individual <- inds
pc.coord$Population <- pops
# perc1 <- paste0(round(tw$percentage, digits = 3) * 100, "%")
perc <- paste0(round(pc$eigenvalues/sum(pc$eigenvalues), digits = 3) * 100, "%")
nb.cols <- 40
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
Check R symbols for plot
#to see all shapes -> plot shapes - para escolher os simbolos
N = 100; M = 1000
good.shapes = c(1:25,33:127)
foo = data.frame( x = rnorm(M), y = rnorm(M), s = factor( sample(1:N, M, replace = TRUE) ) )
ggplot(aes(x,y,shape=s ), data=foo ) +
scale_shape_manual(values=good.shapes[1:N]) +
geom_point()
Sample data
Import sample locations
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
# Check it
head(sampling_loc)
## # A tibble: 6 × 10
## Pop_City Country Latitude Longitude Continent Abbreviation Year Region Marker
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 Saint-M… France 45.2 5.77 Europe FRS 2019 Weste… SNPs
## 2 Strasbo… France 48.6 7.75 Europe STS 2019 Weste… SNPs
## 3 Penafiel Portug… 41.2 -8.33 Europe POP 2017 South… SNPs
## 4 Badajoz Spain 38.9 -6.97 Europe SPB 2018 South… SNPs
## 5 San Roq… Spain 36.2 -5.37 Europe SPS 2017 South… SNPs
## 6 Catarro… Spain 39.4 -0.396 Europe SPC 2017 South… SNPs
## # ℹ 1 more variable: order <dbl>
Check pops
## [1] SOC SOC SOC SOC SOC SOC
## 24 Levels: ALD BAR BUL CRO FRS GES GRA GRC ITB ITP ITR MAL POP ROS SER ... TUH
## [1] 24
Check the regions
## [1] "Western Europe" "Southern Europe" "Eastern Europe"
Merge
merged_data <- merge(pc.coord, sampling_loc, by.x = "Population", by.y = "Abbreviation")
head(merged_data)
## Population PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 1 ALD 6.87147 5.26163 27.5426 -19.6432 -1.80911 -16.15430 -7.81956
## 2 ALD 4.31109 5.21952 22.2495 -21.5589 -1.56090 -11.71560 -3.86349
## 3 ALD 7.07602 8.56385 30.9815 -23.5806 -1.94804 -12.43620 -4.15755
## 4 ALD 10.26780 13.57850 22.6012 -18.8156 -4.95456 -13.01880 -11.34520
## 5 ALD 6.39935 4.97381 25.7990 -20.7505 1.63662 -10.90080 -1.56123
## 6 ALD 6.21099 2.74610 24.2210 -21.6515 -1.67445 -3.44195 -1.81624
## PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15
## 1 -10.36500 24.24300 16.2737 -7.95810 -0.498578 1.20131 -3.485010 1.506300
## 2 -7.48055 18.23330 13.7299 -9.51062 -7.349620 -2.40134 -3.059150 -0.629993
## 3 -9.32047 20.44310 13.7023 -15.85640 -6.098600 -11.91850 5.453580 -5.998260
## 4 -9.69886 22.37030 17.0842 -7.18936 1.032260 2.26645 -0.729475 0.840676
## 5 -5.88371 4.46845 10.8781 -5.67955 14.639100 2.29763 -3.197030 6.576330
## 6 -12.28750 15.13050 16.7096 12.50090 -9.807510 -3.40354 -5.002050 -5.578810
## PC16 PC17 PC18 PC19 PC20 PC21 PC22
## 1 -1.463860 -4.489200 -3.68705 -7.98787 -3.150610 11.47320 -2.755400
## 2 -0.943154 -6.281590 -8.13488 -2.41196 -7.435220 9.66567 -6.217530
## 3 0.820020 -23.688600 6.67702 4.60626 -16.249500 12.11330 -2.230020
## 4 3.611120 -0.269745 -5.88539 -14.26490 2.317960 4.93515 -3.160880
## 5 5.513370 0.359856 2.63026 2.62911 -0.571211 -3.75854 6.898130
## 6 -1.420560 -7.737260 -4.61883 1.82112 -1.939070 3.34233 0.415984
## PC23 PC24 PC25 PC26 PC27 PC28 PC29
## 1 -1.209150 6.01979 -11.375300 -9.33476 22.445000 -0.761356 -4.396160
## 2 -4.802590 1.41351 -10.844100 -13.07420 22.932800 -3.502590 -6.749350
## 3 -3.995680 -22.78690 -0.210035 -3.74492 -4.377440 3.290400 0.740275
## 4 0.254776 3.18717 -4.665940 -6.39446 23.826700 4.145970 -4.746670
## 5 2.052520 9.38321 1.942080 -13.93080 0.486398 -12.355500 -6.030110
## 6 -8.158160 -1.87915 -3.275570 1.57457 5.264370 -3.856800 -5.605450
## PC30 PC31 PC32 PC33 PC34 PC35 PC36
## 1 -5.27034 10.042600 1.432490 8.04307 9.420950 -6.822020 5.42915
## 2 -4.17410 11.047600 5.014480 6.34119 13.777000 -4.385260 6.73198
## 3 31.03320 -14.516600 1.112310 -6.48253 -16.036400 0.665467 -12.03490
## 4 -11.76620 22.929800 0.502217 5.75076 7.159700 -8.690370 5.00697
## 5 -13.86070 -0.513034 4.514890 8.86187 12.313800 -5.950360 4.76725
## 6 -7.83385 6.277840 5.399910 8.29087 -0.274807 -5.516940 5.98567
## PC37 PC38 PC39 PC40 PC41 PC42 PC43
## 1 6.89639 0.902338 -1.74601 -4.31855 -3.337790 6.46373 -4.274400
## 2 -4.26179 5.913240 -3.23657 -2.16951 -1.316360 2.36516 -2.586510
## 3 -27.98710 -7.181260 10.22090 1.46137 11.015400 -15.01210 -0.513531
## 4 3.82930 -0.710736 -6.23698 -2.17686 -0.212686 4.03929 -1.255210
## 5 6.00598 3.462410 10.27540 9.65792 -1.232200 5.44665 6.679330
## 6 1.27482 21.206000 -5.23464 -9.49990 -10.516900 3.65990 -0.745378
## PC44 PC45 PC46 PC47 PC48 PC49 PC50
## 1 -4.889670 -0.834885 2.632470 -0.949742 -1.75957 2.03735 -4.14861
## 2 -4.521860 4.479540 5.391990 3.344440 -10.44580 6.27524 2.00893
## 3 -0.254961 -3.320130 7.475980 -1.928010 6.67487 3.30003 3.99177
## 4 6.994950 0.306433 -0.271766 -6.989500 2.39986 -6.97966 -1.68287
## 5 -8.846900 -2.664450 3.262430 3.322630 4.21955 9.16717 -11.14710
## 6 -4.847610 -14.295000 4.741950 6.266940 -13.93630 -10.30000 2.98200
## PC51 PC52 PC53 PC54 PC55 PC56 PC57 PC58
## 1 1.69663 5.04795 0.941439 1.75767 -5.06479 2.274770 0.0285029 1.039770
## 2 4.25362 4.89318 6.845900 3.48677 -3.56503 1.796720 -3.1626500 -4.166280
## 3 -17.33900 -7.59901 2.815900 -6.88345 -1.61298 -1.377150 3.7605800 0.899097
## 4 -1.77692 2.17802 -0.762373 -5.51989 -3.92726 0.826916 -5.3226100 -3.103370
## 5 6.64275 -4.47418 6.559260 9.33270 7.65505 -5.678200 2.0494900 5.498440
## 6 5.53692 -9.33166 -2.210260 -2.70257 0.12307 8.670370 9.0076700 5.934780
## PC59 PC60 PC61 PC62 PC63 PC64 PC65 PC66
## 1 -3.80141 -2.36701 0.330110 5.53471 6.667620 -1.44503 -2.98581 -1.060080
## 2 -5.50705 10.47880 -8.008920 -3.88921 4.165510 1.67699 -4.26165 -1.484730
## 3 -6.99078 -4.03167 -5.062330 1.60594 -0.248773 -5.11569 -4.35522 0.643318
## 4 1.78482 4.91530 -0.221456 9.28778 2.239000 10.78160 -6.17587 -7.177160
## 5 -3.24248 -3.56364 -16.078400 4.53752 7.062130 -1.82881 1.58330 8.035880
## 6 -4.98620 6.06547 5.290070 -9.24462 -10.634700 -6.64210 8.87078 -5.288190
## PC67 PC68 PC69 PC70 PC71 PC72 PC73 PC74
## 1 -2.55903 4.341470 -5.18417 -0.508175 -3.754400 5.931520 -1.02693 0.693196
## 2 2.51353 0.185915 3.85358 -8.944010 3.321050 6.594450 -2.22094 4.667610
## 3 -2.41188 -3.537930 4.00229 -4.936820 -1.373640 0.435795 -6.22612 0.375516
## 4 -7.88631 1.903800 8.98849 5.582710 -4.629030 -0.453036 -2.79199 -7.336970
## 5 2.00610 -7.835620 3.20214 -12.637800 -0.395118 -1.369610 7.74331 -1.029260
## 6 -1.22522 0.701698 -9.98630 -3.389650 11.210500 -0.417328 7.25652 9.353620
## PC75 PC76 PC77 PC78 PC79 PC80 PC81
## 1 8.943920 -1.97047 -9.492210 -1.738910 -6.791550 5.601680 5.9793200
## 2 -2.821130 -1.02472 -10.137400 1.078810 0.583585 13.054000 13.7605000
## 3 -3.768870 4.79239 -1.184330 -1.748970 -3.932040 -0.732289 -2.5970100
## 4 9.083830 -4.09803 1.560900 0.455056 3.083980 -6.221410 -8.6498400
## 5 4.609000 5.35377 8.745130 -5.812890 -9.297840 5.535630 6.6904500
## 6 -0.821431 -10.33440 0.530521 -14.417200 -7.216930 -1.200610 0.0949446
## PC82 PC83 PC84 PC85 PC86 PC87 PC88
## 1 -8.729610 3.13340 1.95181 10.323700 2.30143 -0.738303 1.126160
## 2 3.524580 -3.02701 11.26160 9.869880 1.25232 4.446760 7.722050
## 3 -0.547045 -2.21873 -4.48130 0.351344 5.19444 -1.979860 0.272153
## 4 -3.874960 13.46160 -2.93988 -0.649672 -7.65892 6.273620 -4.998780
## 5 9.202600 -12.26820 -3.62705 -9.345940 4.02577 -8.890660 -5.643380
## 6 11.740400 -12.99900 -3.62824 -15.275400 9.25467 3.723110 12.642400
## PC89 PC90 PC91 PC92 PC93 PC94 PC95
## 1 -0.690100 -11.90290 0.199257 8.401910 -4.8179800 3.47306 0.808408
## 2 -4.007930 -8.36975 14.283100 0.378127 -6.4248400 12.41980 -3.120790
## 3 0.376994 -1.09612 2.795670 -6.971720 0.0575227 2.76754 4.791860
## 4 5.904950 8.45221 -3.737630 -1.818050 0.7750060 -7.20450 1.925880
## 5 10.952400 -6.23835 2.667120 -3.304150 0.1166090 -8.10624 -6.779030
## 6 -7.265100 1.26762 -3.816030 -5.919410 3.2509600 10.06920 11.836400
## PC96 PC97 PC98 PC99 PC100 PC101 PC102 PC103
## 1 -9.838930 8.828700 0.337825 5.587130 -2.33368 2.35843 -9.982920 8.03262
## 2 -0.913488 -1.716650 -0.920745 -9.678150 -2.42400 -6.65705 -2.459040 1.39539
## 3 2.594700 -0.342536 -0.661906 -2.506140 0.70618 -1.31657 -0.587811 -4.06532
## 4 -0.207329 7.704440 14.023800 1.444190 -5.80034 4.87730 -1.842140 -4.00806
## 5 -1.648090 14.503800 -12.380200 -1.752890 11.70830 -5.92960 13.988000 16.56240
## 6 1.079430 -4.673990 -11.446000 -0.539707 2.30793 -2.90635 -6.188520 1.91365
## PC104 PC105 PC106 PC107 PC108 PC109 PC110
## 1 -3.3762200 0.752139 1.821930 -5.777340 -2.2593800 2.34481 0.8720420
## 2 9.4813900 3.539750 20.305800 -2.822180 -7.3995100 -16.27350 -5.6547700
## 3 0.0603229 -0.569457 0.232628 1.763520 -1.1373300 3.13001 -0.0986376
## 4 5.9644900 10.692900 -9.489190 3.896370 -6.2623100 4.55672 -1.4935300
## 5 -10.7812000 -6.915540 5.010980 1.080820 7.1101600 -5.55810 -1.5897200
## 6 -5.4519600 -10.176000 -11.139200 0.687877 0.0851489 -3.44785 -4.8365400
## PC111 PC112 PC113 PC114 PC115 PC116 PC117 PC118
## 1 2.948270 10.18730 4.08040 -7.996500 2.00662 -3.732110 -7.84511 3.31812
## 2 2.881780 1.81042 -3.52328 -7.269180 -21.72760 4.923060 -4.40725 4.96985
## 3 0.432105 2.04992 -1.13583 -3.068300 4.42717 -0.345268 -4.60966 -6.47014
## 4 -1.886870 -7.36717 1.55600 0.292359 7.08347 8.578810 -6.03043 5.12692
## 5 8.191170 5.61286 3.58951 -5.268760 -5.41897 -5.108210 9.92263 -7.51006
## 6 8.675660 -11.63880 -9.61435 10.512300 5.73394 -1.829660 4.80891 -10.24450
## PC119 PC120 PC121 PC122 PC123 PC124 PC125 PC126
## 1 3.64098 5.47077 -4.64693 3.677880 7.47611 -4.06151 -5.911430 2.059690
## 2 15.18370 -12.63590 -9.75064 -11.960100 -1.61328 4.31152 4.228860 0.586107
## 3 0.20213 1.06554 1.16694 -0.439249 -2.94780 -3.14326 0.209242 -2.826610
## 4 -3.08938 7.54306 3.31451 1.022480 -6.35207 -5.01984 5.971160 -2.368150
## 5 -2.57756 11.95780 -1.82469 -0.858057 2.45102 4.79712 3.764130 4.796640
## 6 -5.76753 -1.05282 -6.44558 2.297340 8.29517 6.84979 6.161040 -0.548687
## PC127 PC128 PC129 PC130 PC131 PC132 PC133 PC134
## 1 -4.894170 1.15444 3.460110 -3.28217 6.62111 -3.176930 3.510010 -4.14023
## 2 -5.803120 9.34380 -3.820230 1.34970 1.37499 -0.499883 0.539114 -4.27765
## 3 0.751653 2.53610 -2.247280 -2.65949 1.42226 3.786320 -2.931290 1.09076
## 4 5.146660 -4.32904 -9.205080 -5.62199 -2.30694 5.028540 -5.407290 7.45846
## 5 2.553590 -3.94670 -0.184637 6.01789 6.87990 -3.225950 6.665160 2.22558
## 6 5.877120 -2.08898 -0.926312 -1.22491 1.96330 0.463014 0.434334 2.59585
## PC135 PC136 PC137 PC138 PC139 PC140 PC141
## 1 6.168580 7.28387 5.26780 5.154720 -14.558000 -11.21690 -3.177390
## 2 5.670870 -5.08248 -2.18662 2.044120 1.417690 9.29795 5.059950
## 3 -0.851386 -5.30379 -3.09618 -1.034900 0.982222 -1.44086 0.293229
## 4 7.660590 -7.07779 -4.19355 -1.457970 1.627850 -1.37569 0.524640
## 5 -1.756110 5.97924 -6.67101 -1.991330 -0.044384 -9.17410 -2.163030
## 6 -8.366400 -2.51210 6.34664 0.424479 7.110340 -6.57425 8.398450
## PC142 PC143 PC144 PC145 PC146 PC147 PC148
## 1 10.236200 2.390040 -7.038190 -16.688500 4.84877 3.645700 -6.671070
## 2 0.594910 -3.112800 10.418900 -4.044020 5.04455 7.592150 3.161890
## 3 1.979340 0.629465 0.223771 -0.866984 0.80221 -0.454170 -1.926930
## 4 -0.839163 1.467130 -3.051540 4.425190 12.29440 -0.231059 6.784050
## 5 -13.967400 -1.319000 6.324470 0.233821 -7.17052 6.478370 3.014360
## 6 -1.087040 1.165390 -6.596010 12.361500 -1.50361 0.862480 -0.120968
## PC149 PC150 PC151 PC152 PC153 PC154 PC155
## 1 15.05110 8.98300 -6.787160 13.350200 -18.785900 -6.179390 1.70564
## 2 -10.11130 3.26962 -4.521240 -7.059730 6.905980 8.044910 -8.66102
## 3 5.90810 0.30858 -0.566322 -0.634108 -0.687383 0.508402 -1.80392
## 4 1.74516 5.65122 -13.357700 -4.775000 0.869868 5.387900 4.28431
## 5 -7.85455 5.45443 -6.189330 1.201290 2.779680 8.501620 1.53625
## 6 -9.37284 -7.52422 10.340300 -3.922160 -2.171440 10.177200 8.41302
## PC156 PC157 PC158 PC159 PC160 PC161 PC162 PC163
## 1 4.43127 11.51580 -8.66775 -0.926102 -3.29969 -0.721268 -0.773033 -7.2865900
## 2 10.98290 -8.73883 -2.01075 4.993950 -3.83857 0.694734 3.523610 4.4857600
## 3 -1.41340 3.12608 -3.96009 -1.898540 -2.06178 0.206047 0.631220 2.4119400
## 4 0.18277 6.75472 -4.26540 -0.303048 3.43349 5.569890 -1.932970 2.1116300
## 5 -4.28591 -5.16115 6.79925 -8.700190 3.37856 -2.174120 -7.731370 -0.0483597
## 6 1.68728 -2.49246 -7.51101 -0.268009 2.17712 6.286020 13.790500 -2.3983300
## PC164 PC165 PC166 PC167 PC168 PC169 PC170 PC171
## 1 -4.042290 3.09612 7.139980 5.29631 15.89350 -1.790890 5.17591 -12.75090
## 2 0.949113 1.59353 0.260560 4.41583 -11.76880 -6.669640 3.68179 9.27866
## 3 0.571039 2.19855 -0.556776 -2.61268 1.28337 1.033490 -2.10754 1.36703
## 4 0.662403 -5.11803 -8.094840 8.71185 4.09189 -0.327079 3.40476 -6.36834
## 5 3.445760 2.36084 -6.321930 -6.41518 7.20189 15.460300 -7.31604 -12.66380
## 6 6.151660 6.30566 3.107480 9.78858 -0.41181 -5.753420 -11.12380 3.73141
## PC172 PC173 PC174 PC175 PC176 PC177 PC178 PC179
## 1 6.626550 4.31022 10.67870 -0.871897 10.813100 -7.720720 11.86240 -6.99945
## 2 -6.896720 9.61003 -4.35415 2.362180 -2.194610 6.082290 -15.93930 -1.43344
## 3 1.677810 -3.07163 -1.53284 -0.139639 0.330184 0.649722 1.98983 3.26036
## 4 -9.189190 -7.99462 -9.14903 2.832820 -6.777400 -2.835200 -7.81709 8.05522
## 5 -0.174438 -1.91834 -4.66055 -2.991300 2.417320 8.889000 -3.98891 16.35390
## 6 5.611700 9.43890 11.02760 2.003670 5.331450 -11.685700 10.81990 3.41962
## PC180 PC181 PC182 PC183 PC184 PC185 PC186
## 1 11.1457000 7.62780 -1.61661 6.396660 6.979940 -0.5269220 -5.39669
## 2 -1.6096800 -13.01920 3.16703 -3.756260 0.592469 0.0878305 11.62250
## 3 -0.0653278 -2.74602 1.66943 2.216780 3.831330 2.5738400 -5.07528
## 4 -8.9797300 3.08896 -6.96459 5.608270 4.285100 -8.0731200 -6.06966
## 5 2.2272000 1.21650 11.10260 -0.918296 -8.356560 8.5428700 -8.77882
## 6 -2.2279200 3.11474 -12.70440 0.122042 1.832110 -1.7017100 1.12921
## PC187 PC188 PC189 PC190 PC191 PC192 PC193
## 1 -4.560270 -2.926380 -1.45043 4.890190 0.354412 2.58545 6.662430
## 2 2.472280 -0.818201 5.90885 1.899670 -3.406510 -1.58402 -1.803730
## 3 0.258011 -1.236530 -3.15241 -2.158180 -0.372508 -2.66819 0.498575
## 4 2.281540 4.227620 -2.66572 -14.240100 3.695420 3.58747 -4.118160
## 5 8.856470 7.463360 -2.90971 -6.202960 -1.508350 1.10963 4.090170
## 6 4.254910 4.126330 -4.55672 -0.378392 1.295050 -3.39989 -3.821560
## PC194 PC195 PC196 PC197 PC198 PC199 PC200
## 1 4.262380 0.515846 -8.566740 2.014060 -7.084220 4.326450 5.910580
## 2 -0.554089 0.280376 -2.974510 1.616150 -0.590262 -4.865230 1.109370
## 3 -0.882154 2.473390 -4.041740 0.402075 -0.783201 0.202803 3.994180
## 4 -7.119610 -6.236500 16.059800 2.764400 13.181200 0.363498 2.888800
## 5 -4.898250 -1.970200 0.309063 -4.654670 4.239610 2.814490 -2.785550
## 6 1.560230 -3.649350 2.050720 5.056310 0.735307 -5.975880 -0.337571
## PC201 PC202 PC203 PC204 PC205 PC206 PC207
## 1 7.1272300 -2.709230 -1.065420 8.148680 -0.736573 1.4311800 -0.216668
## 2 -7.5391400 1.690920 -2.912910 0.595227 4.683580 0.0228143 -2.257040
## 3 1.3834500 -0.431804 0.522443 -4.524000 -2.378350 -1.7871700 -0.747820
## 4 -7.6226900 0.822273 8.823450 -12.955200 -8.043530 -3.8713500 -0.550714
## 5 3.2634600 1.949570 1.516530 -3.760890 -2.097190 -2.6244100 -2.017730
## 6 -0.0827874 1.875820 7.117900 1.132390 -4.918460 -0.5684990 3.258320
## PC208 PC209 PC210 PC211 PC212 PC213 PC214
## 1 -0.408560 -1.3711900 3.075730 6.839290 1.583770 0.450098 2.60971
## 2 0.766703 0.3673000 -2.081540 -3.843040 -1.047030 -0.814682 2.46263
## 3 -1.005250 1.2908300 1.934540 -0.948855 -0.883671 1.928830 -2.25307
## 4 -1.101920 6.3714700 0.157383 -5.839940 -5.153320 7.563710 -7.11080
## 5 -1.668520 4.9982300 1.800290 2.014730 2.355420 3.840150 -1.30494
## 6 3.027610 0.0241384 2.651030 -2.738640 -3.112610 -1.591160 -4.60890
## PC215 PC216 PC217 PC218 PC219 PC220 PC221
## 1 -0.0997521 -2.36505000 -0.738434 1.626230 1.795760 -0.855020 1.8545900
## 2 -2.0137300 1.40281000 -1.899280 0.443710 0.176487 -0.848109 -0.5974610
## 3 0.3588550 0.00104404 -0.707371 -1.108600 -0.352831 0.840657 -0.0821584
## 4 1.9304800 -4.79745000 1.020960 1.000170 -0.446825 0.282893 0.3793520
## 5 -0.1964210 -0.18083100 0.211709 -0.127899 -1.389970 1.006710 -0.2298240
## 6 2.6655000 -5.98027000 -1.729570 -0.971102 0.306753 1.484090 1.4648600
## PC222 PC223 PC224 PC225 PC226 PC227 PC228
## 1 -1.688840 -1.236770 1.167120 1.131090 -0.581292 0.450882 -0.3138020
## 2 0.761055 0.368858 -0.564876 -0.278662 -1.976160 0.377243 0.5783510
## 3 0.438190 -1.659280 0.202303 0.095516 -0.731623 0.674557 2.7984000
## 4 0.575892 -0.185816 -3.310390 0.587935 1.368900 -1.330620 0.0518203
## 5 0.150151 -1.831350 1.341060 0.262969 0.851154 1.275190 0.2684870
## 6 -0.398876 -0.310474 -2.188210 -1.299870 0.432901 0.344625 0.2063330
## PC229 PC230 PC231 PC232 PC233 PC234 PC235
## 1 -1.071550 0.332864 -0.7154830 1.274580 0.193993 0.219649 0.2712430
## 2 0.768208 0.574145 0.0912875 -0.846310 -0.134618 -0.090734 0.0277827
## 3 6.783920 -10.266800 8.0236800 -30.847300 2.498060 -1.493970 -7.8576400
## 4 -1.678850 -0.506855 -0.8022830 0.985217 0.125841 0.820878 0.2707700
## 5 -0.859872 -1.012110 0.5197600 -0.185790 -0.584776 -0.220435 -0.6896720
## 6 0.874993 0.138532 -0.7522170 0.807094 0.633243 -0.483935 -0.1552350
## PC236 PC237 PC238 PC239 PC240 PC241 PC242
## 1 0.457339 0.1643130 0.3506810 0.53345900 0.00789868 0.0524563 NaN
## 2 0.075055 -0.1686190 0.0808541 -0.18602300 0.28178700 0.1454950 NaN
## 3 -5.805560 2.8301200 1.2021700 -0.16667100 -0.26716700 -0.0988927 NaN
## 4 0.314064 0.0215140 0.1282270 -0.80934600 -0.07242090 -0.2257470 NaN
## 5 -0.315354 0.0146972 0.3423180 0.00204201 0.04326190 0.0915903 NaN
## 6 -0.217331 -0.0569785 -0.4812990 -0.21896700 -0.00764387 0.0491660 NaN
## Individual Pop_City Country Latitude Longitude Continent Year Region
## 1 802 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 2 801 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 3 806 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 4 803 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 5 804 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## 6 805 Durres Albania 41.29704 19.50373 Europe 2018 Southern Europe
## Marker order
## 1 SNPs 33
## 2 SNPs 33
## 3 SNPs 33
## 4 SNPs 33
## 5 SNPs 33
## 6 SNPs 33
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/PCA_lea_europe_Set3_pc1_pc2.pdf"
),
width = 8,
height = 6,
units = "in"
)
####
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/PCA_lea_europe_Set3_pc1_pc3.pdf"
),
width = 8,
height = 6,
units = "in"
)
We will do 5 repetitions
# set output dir
# main options
# K = number of ancestral populations
# entropy = TRUE computes the cross-entropy criterion, # CPU = 4 is the number of CPU used (hidden input) project = NULL
project = snmf(
genotype,
K = 1:20,
project = "new",
repetitions = 5,
percentage = 0.25,
iterations = 500,
CPU = 10,
entropy = TRUE
)
project = load.snmfProject("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.snmfProject")
Cross entropy
# Open a new pdf file
pdf(here("output","europe","lea","overlap","lea_cross_entropy_europe_overlap.pdf"), width = 6, height = 4)
# Create your plot
plot(project, col = "blue", pch = 19, cex = 1.2)
# Close the pdf file
dev.off()
## png
## 2
Summary of project
## $repetitions
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7 K = 8 K = 9
## with cross-entropy 5 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5 5
## K = 10 K = 11 K = 12 K = 13 K = 14 K = 15 K = 16 K = 17
## with cross-entropy 5 5 5 5 5 5 5 5
## without cross-entropy 0 0 0 0 0 0 0 0
## total 5 5 5 5 5 5 5 5
## K = 18 K = 19 K = 20
## with cross-entropy 5 5 5
## without cross-entropy 0 0 0
## total 5 5 5
##
## $crossEntropy
## K = 1 K = 2 K = 3 K = 4 K = 5 K = 6 K = 7
## min 0.9332293 0.9251151 0.9187669 0.9121480 0.9088385 0.9051482 0.9032592
## mean 0.9334841 0.9254324 0.9191965 0.9125867 0.9093271 0.9056410 0.9036056
## max 0.9341471 0.9261314 0.9198113 0.9130778 0.9099866 0.9064215 0.9041027
## K = 8 K = 9 K = 10 K = 11 K = 12 K = 13 K = 14
## min 0.9025382 0.9025389 0.9024926 0.9025329 0.9033621 0.9049762 0.9065019
## mean 0.9030838 0.9030723 0.9031444 0.9039814 0.9046951 0.9063263 0.9078695
## max 0.9043053 0.9038458 0.9037498 0.9060784 0.9061042 0.9075991 0.9091732
## K = 15 K = 16 K = 17 K = 18 K = 19 K = 20
## min 0.9081570 0.9100895 0.9130928 0.9150371 0.9179452 0.9197988
## mean 0.9098571 0.9115015 0.9138309 0.9161023 0.9194263 0.9208850
## max 0.9120404 0.9137447 0.9144777 0.9174135 0.9204794 0.9224509
# get the cross-entropy of all runs for K = 10
ce10 = cross.entropy(project, K = 10)
ce10 #run 3 is best for k=10
## K = 10
## run 1 0.9029170
## run 2 0.9037498
## run 3 0.9024926
## run 4 0.9037463
## run 5 0.9028161
## K = 8
## run 1 0.9027781
## run 2 0.9043053
## run 3 0.9025382
## run 4 0.9032114
## run 5 0.9025859
k=10 appears to have the lowest Cross-entropy, but k=8 is very close. We will plot both
color_palette <-
c(
"chocolate4",
"blue",
"green4",
"purple",
"green",
"#B22222",
"#FF8C1A",
"#F49AC2",
"yellow2",
"#FFB347"
)
Mean admixture by country using ggplot
best = which.min(cross.entropy(project, K = 10)) #3
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
library(reshape2)
# Extract ancestry coefficients
Q_values <- as.data.frame(Q(project, K = 10, run = best))
# Create a named vector to map countries to regions
# Add individual IDs and pops ids
Q_values$ind <- inds
Q_values$pop <- pops
# Melt the data frame for plotting
Q_melted <- melt(Q_values, id.vars = c("ind", "pop"))
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country
Q_ordered <- Q_joined |>
arrange(Region, Country) |>
mutate(Region_Country = factor(Region_Country, levels = unique(Region_Country)))
# Group by Country and calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(Region_Country, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <- data.frame(Region_Country = unique(Q_grouped$Region_Country))
# Add the order of each country to ensure correct placement of borders
borders$order <- 1:nrow(borders) + 0.5 # Shift borders to the right edge of the bars
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(Region_Country) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# source the plotting function
source(
here("analyses", "my_theme2.R"
)
)
# Generate all potential variable names
all_variables <- paste0("V", 1:10)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge this data frame with Q_grouped_filtered to create the new color column
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create bar chart
ggplot(Q_grouped_filtered, aes(x = Region_Country, y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_segment(data = borders, aes(x = order, xend = order, y = 0, yend = 1, fill = NULL), linetype = "solid", color = "#2C444A") + # Add borders
my_theme() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none") + # Hide legend
xlab("") + # Suppress x-axis label
ylab("Ancestry proportions") +
ggtitle("Ancestry matrix") +
labs(caption = "Each bar represents the average ancestry proportions for individuals in a given country for k=14.") +
# scale_fill_manual(values = color) +
scale_x_discrete(labels = function(x) gsub(".*_", "", x)) + # Remove Region prefix from labels
scale_fill_manual(values = color_palette)
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/overlap/LEA_admixture_by_country_overlap_Set3_k10.pdf"
),
width = 10,
height = 7,
units = "in"
)
## [1] 3
change to correct matrix
leak10 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.snmf/K10/run3/r2_0.01_b_overlap_r3.10.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak10)
## # A tibble: 6 × 10
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.19e-2 1.00e-4 1.59e-2 1.73e-2 0.0642 5.05e-2 1.33e-2 0.734 1.36e-3 5.19e-2
## 2 2.89e-2 1.00e-4 7.38e-3 2.86e-2 0.0148 1.00e-4 5.35e-3 0.838 1.00e-4 7.66e-2
## 3 9.99e-5 9.99e-5 9.99e-5 9.99e-5 0.00954 5.95e-2 9.99e-5 0.897 3.29e-2 9.99e-5
## 4 9.35e-2 2.17e-2 4.28e-2 5.71e-2 0.0546 2.97e-2 1.68e-2 0.657 1.00e-4 2.69e-2
## 5 1.29e-1 1.74e-2 3.53e-2 2.69e-2 0.0542 5.47e-2 1.86e-2 0.600 2.98e-2 3.42e-2
## 6 5.74e-2 3.10e-2 2.48e-2 1.00e-4 0.130 3.48e-2 4.01e-2 0.665 1.00e-4 1.69e-2
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5
## 1 1065 SOC 0.051922800 0.000099991 0.015868100 0.017267000 0.06424650
## 2 1066 SOC 0.028928900 0.000099973 0.007376850 0.028551100 0.01480000
## 3 1067 SOC 0.000099946 0.000099946 0.000099946 0.000099946 0.00954467
## 4 1068 SOC 0.093526200 0.021670200 0.042770800 0.057149600 0.05463680
## 5 1069 SOC 0.128679000 0.017401400 0.035268600 0.026883400 0.05418260
## 6 1070 SOC 0.057381800 0.030952700 0.024805200 0.000099982 0.13029200
## X6 X7 X8 X9 X10
## 1 0.050529300 0.013282200 0.733502 0.001362700 0.051919600
## 2 0.000099973 0.005353340 0.838124 0.000099973 0.076566000
## 3 0.059463600 0.000099946 0.897442 0.032949900 0.000099946
## 4 0.029740400 0.016780200 0.656749 0.000099991 0.026876300
## 5 0.054684100 0.018560700 0.600328 0.029819000 0.034192700
## 6 0.034762800 0.040107000 0.664566 0.000099982 0.016932600
Rename the columns
# Rename the columns starting from the third one
leak10 <- leak10 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak10)
## ind pop v1 v2 v3 v4 v5
## 1 1065 SOC 0.051922800 0.000099991 0.015868100 0.017267000 0.06424650
## 2 1066 SOC 0.028928900 0.000099973 0.007376850 0.028551100 0.01480000
## 3 1067 SOC 0.000099946 0.000099946 0.000099946 0.000099946 0.00954467
## 4 1068 SOC 0.093526200 0.021670200 0.042770800 0.057149600 0.05463680
## 5 1069 SOC 0.128679000 0.017401400 0.035268600 0.026883400 0.05418260
## 6 1070 SOC 0.057381800 0.030952700 0.024805200 0.000099982 0.13029200
## v6 v7 v8 v9 v10
## 1 0.050529300 0.013282200 0.733502 0.001362700 0.051919600
## 2 0.000099973 0.005353340 0.838124 0.000099973 0.076566000
## 3 0.059463600 0.000099946 0.897442 0.032949900 0.000099946
## 4 0.029740400 0.016780200 0.656749 0.000099991 0.026876300
## 5 0.054684100 0.018560700 0.600328 0.029819000 0.034192700
## 6 0.034762800 0.040107000 0.664566 0.000099982 0.016932600
Import Sample Locations
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
head(sampling_loc)
## # A tibble: 6 × 10
## Pop_City Country Latitude Longitude Continent Abbreviation Year Region Marker
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 Saint-M… France 45.2 5.77 Europe FRS 2019 Weste… SNPs
## 2 Strasbo… France 48.6 7.75 Europe STS 2019 Weste… SNPs
## 3 Penafiel Portug… 41.2 -8.33 Europe POP 2017 South… SNPs
## 4 Badajoz Spain 38.9 -6.97 Europe SPB 2018 South… SNPs
## 5 San Roq… Spain 36.2 -5.37 Europe SPS 2017 South… SNPs
## 6 Catarro… Spain 39.4 -0.396 Europe SPC 2017 South… SNPs
## # ℹ 1 more variable: order <dbl>
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak10 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FFB347",
"purple",
"#B22222",
"#F49AC2",
"green",
"#FF8C1A",
"yellow2",
"chocolate4",
"green4",
"blue")
# Generate all potential variable names
all_variables <- paste0("v", 1:10)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=10.\n LEA inference for 20,968 SNPs (MAF 1%) overlap.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
color_palette <-
c(
"green",
"#F49AC2",
"#B22222",
"purple",
"chocolate4",
"#FF8C1A",
"yellow2",
"blue"
)
Mean admixture by country using ggplot
best = which.min(cross.entropy(project, K = 8)) #3
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
library(reshape2)
# Extract ancestry coefficients
Q_values <- as.data.frame(Q(project, K = 8, run = best))
# Create a named vector to map countries to regions
# Add individual IDs and pops ids
Q_values$ind <- inds
Q_values$pop <- pops
# Melt the data frame for plotting
Q_melted <- melt(Q_values, id.vars = c("ind", "pop"))
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country
Q_ordered <- Q_joined |>
arrange(Region, Country) |>
mutate(Region_Country = factor(Region_Country, levels = unique(Region_Country)))
# Group by Country and calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(Region_Country, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <- data.frame(Region_Country = unique(Q_grouped$Region_Country))
# Add the order of each country to ensure correct placement of borders
borders$order <- 1:nrow(borders) + 0.5 # Shift borders to the right edge of the bars
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(Region_Country) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# source the plotting function
source(
here("analyses", "my_theme2.R"
)
)
# Generate all potential variable names
all_variables <- paste0("V", 1:8)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge this data frame with Q_grouped_filtered to create the new color column
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create bar chart
ggplot(Q_grouped_filtered, aes(x = Region_Country, y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_segment(data = borders, aes(x = order, xend = order, y = 0, yend = 1, fill = NULL), linetype = "solid", color = "#2C444A") + # Add borders
my_theme() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none") + # Hide legend
xlab("") + # Suppress x-axis label
ylab("Ancestry proportions") +
ggtitle("Ancestry matrix") +
labs(caption = "Each bar represents the average ancestry proportions for individuals in a given country for k=8.") +
# scale_fill_manual(values = color) +
scale_x_discrete(labels = function(x) gsub(".*_", "", x)) + # Remove Region prefix from labels
scale_fill_manual(values = color_palette)
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/overlap/LEA_admixture_by_country_overlap_Set3_k8.pdf"
),
width = 10,
height = 7,
units = "in"
)
## [1] 3
change to correct matrix
leak8 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.snmf/K8/run3/r2_0.01_b_overlap_r3.8.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak8)
## # A tibble: 6 × 8
## X1 X2 X3 X4 X5 X6 X7 X8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0352 0.741 0.0802 0.0217 0.0589 0.00872 0.0151 0.0395
## 2 0.0245 0.844 0.00383 0.000100 0.00987 0.0356 0.000100 0.0822
## 3 0.000100 0.904 0.000100 0.0306 0.0450 0.0184 0.00138 0.000100
## 4 0.0657 0.654 0.0609 0.000100 0.113 0.0714 0.0162 0.0188
## 5 0.0706 0.640 0.0787 0.0240 0.129 0.0183 0.000100 0.0391
## 6 0.0397 0.689 0.123 0.00140 0.0687 0.0258 0.0314 0.0205
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4 X5 X6
## 1 1065 SOC 0.035186300 0.740781 0.080211600 0.021674300 0.05888470 0.00872346
## 2 1066 SOC 0.024454300 0.843784 0.003827620 0.000099982 0.00986846 0.03563570
## 3 1067 SOC 0.000099973 0.904298 0.000099973 0.030627700 0.04504270 0.01835360
## 4 1068 SOC 0.065653000 0.654353 0.060883800 0.000099991 0.11257200 0.07144340
## 5 1069 SOC 0.070633500 0.639905 0.078706900 0.024010000 0.12916100 0.01833760
## 6 1070 SOC 0.039741000 0.688908 0.123482000 0.001400260 0.06872810 0.02578520
## X7 X8
## 1 0.015088200 0.039450200
## 2 0.000099982 0.082230000
## 3 0.001378010 0.000099973
## 4 0.016220900 0.018774200
## 5 0.000099991 0.039146300
## 6 0.031432300 0.020522800
Rename the columns
# Rename the columns starting from the third one
leak8 <- leak8 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak8)
## ind pop v1 v2 v3 v4 v5 v6
## 1 1065 SOC 0.035186300 0.740781 0.080211600 0.021674300 0.05888470 0.00872346
## 2 1066 SOC 0.024454300 0.843784 0.003827620 0.000099982 0.00986846 0.03563570
## 3 1067 SOC 0.000099973 0.904298 0.000099973 0.030627700 0.04504270 0.01835360
## 4 1068 SOC 0.065653000 0.654353 0.060883800 0.000099991 0.11257200 0.07144340
## 5 1069 SOC 0.070633500 0.639905 0.078706900 0.024010000 0.12916100 0.01833760
## 6 1070 SOC 0.039741000 0.688908 0.123482000 0.001400260 0.06872810 0.02578520
## v7 v8
## 1 0.015088200 0.039450200
## 2 0.000099982 0.082230000
## 3 0.001378010 0.000099973
## 4 0.016220900 0.018774200
## 5 0.000099991 0.039146300
## 6 0.031432300 0.020522800
Import Sample Locations
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
head(sampling_loc)
## # A tibble: 6 × 10
## Pop_City Country Latitude Longitude Continent Abbreviation Year Region Marker
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 Saint-M… France 45.2 5.77 Europe FRS 2019 Weste… SNPs
## 2 Strasbo… France 48.6 7.75 Europe STS 2019 Weste… SNPs
## 3 Penafiel Portug… 41.2 -8.33 Europe POP 2017 South… SNPs
## 4 Badajoz Spain 38.9 -6.97 Europe SPB 2018 South… SNPs
## 5 San Roq… Spain 36.2 -5.37 Europe SPS 2017 South… SNPs
## 6 Catarro… Spain 39.4 -0.396 Europe SPC 2017 South… SNPs
## # ℹ 1 more variable: order <dbl>
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak8 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"yellow2",
"purple",
"green",
"#F49AC2",
"#FF8C1A",
"chocolate4",
"blue",
"#B22222"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:8)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=8.\n LEA inference for 20,968 SNPs (MAF 1%) overlap.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Mean admixture by country using ggplot
best = which.min(cross.entropy(project, K = 4)) #3
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
library(reshape2)
# Extract ancestry coefficients
Q_values <- as.data.frame(Q(project, K = 4, run = best))
# Create a named vector to map countries to regions
# Add individual IDs and pops ids
Q_values$ind <- inds
Q_values$pop <- pops
# Melt the data frame for plotting
Q_melted <- melt(Q_values, id.vars = c("ind", "pop"))
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country
Q_ordered <- Q_joined |>
arrange(Region, Country) |>
mutate(Region_Country = factor(Region_Country, levels = unique(Region_Country)))
# Group by Country and calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(Region_Country, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <- data.frame(Region_Country = unique(Q_grouped$Region_Country))
# Add the order of each country to ensure correct placement of borders
borders$order <- 1:nrow(borders) + 0.5 # Shift borders to the right edge of the bars
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(Region_Country) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# source the plotting function
source(
here("analyses", "my_theme2.R"
)
)
# Generate all potential variable names
all_variables <- paste0("V", 1:10)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge this data frame with Q_grouped_filtered to create the new color column
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create bar chart
ggplot(Q_grouped_filtered, aes(x = Region_Country, y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_segment(data = borders, aes(x = order, xend = order, y = 0, yend = 1, fill = NULL), linetype = "solid", color = "#2C444A") + # Add borders
my_theme() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none") + # Hide legend
xlab("") + # Suppress x-axis label
ylab("Ancestry proportions") +
ggtitle("Ancestry matrix") +
labs(caption = "Each bar represents the average ancestry proportions for individuals in a given country for k=4.") +
# scale_fill_manual(values = color) +
scale_x_discrete(labels = function(x) gsub(".*_", "", x)) + # Remove Region prefix from labels
scale_fill_manual(values = color_palette)
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/overlap/LEA_admixture_by_country_overlap_Set3_k4.pdf"
),
width = 10,
height = 7,
units = "in"
)
## [1] 5
change to correct matrix
leak4 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.snmf/K4/run5/r2_0.01_b_overlap_r5.4.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak4)
## # A tibble: 6 × 4
## X1 X2 X3 X4
## <dbl> <dbl> <dbl> <dbl>
## 1 0.749 0.0751 0.0128 0.163
## 2 0.872 0.0848 0.0199 0.0231
## 3 0.884 0.0576 0.0144 0.0439
## 4 0.691 0.157 0.0404 0.112
## 5 0.659 0.166 0.0181 0.157
## 6 0.693 0.0460 0.0464 0.215
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3 X4
## 1 1065 SOC 0.748651 0.0751187 0.0127797 0.1634510
## 2 1066 SOC 0.872172 0.0847543 0.0199333 0.0231402
## 3 1067 SOC 0.884139 0.0575614 0.0143838 0.0439161
## 4 1068 SOC 0.690848 0.1568060 0.0403530 0.1119930
## 5 1069 SOC 0.659272 0.1660700 0.0181143 0.1565430
## 6 1070 SOC 0.693111 0.0459925 0.0463633 0.2145330
Rename the columns
# Rename the columns starting from the third one
leak4 <- leak4 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak4)
## ind pop v1 v2 v3 v4
## 1 1065 SOC 0.748651 0.0751187 0.0127797 0.1634510
## 2 1066 SOC 0.872172 0.0847543 0.0199333 0.0231402
## 3 1067 SOC 0.884139 0.0575614 0.0143838 0.0439161
## 4 1068 SOC 0.690848 0.1568060 0.0403530 0.1119930
## 5 1069 SOC 0.659272 0.1660700 0.0181143 0.1565430
## 6 1070 SOC 0.693111 0.0459925 0.0463633 0.2145330
Import Sample Locations
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
head(sampling_loc)
## # A tibble: 6 × 10
## Pop_City Country Latitude Longitude Continent Abbreviation Year Region Marker
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 Saint-M… France 45.2 5.77 Europe FRS 2019 Weste… SNPs
## 2 Strasbo… France 48.6 7.75 Europe STS 2019 Weste… SNPs
## 3 Penafiel Portug… 41.2 -8.33 Europe POP 2017 South… SNPs
## 4 Badajoz Spain 38.9 -6.97 Europe SPB 2018 South… SNPs
## 5 San Roq… Spain 36.2 -5.37 Europe SPS 2017 South… SNPs
## 6 Catarro… Spain 39.4 -0.396 Europe SPC 2017 South… SNPs
## # ℹ 1 more variable: order <dbl>
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak4 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FF8C1A",
"#77DD77",
"yellow2",
"#1E90FF" )
# Generate all potential variable names
all_variables <- paste0("v", 1:4)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=4.\n LEA inference for 20,968 SNPs (MAF 1%) overlap.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
Mean admixture by country using ggplot
best = which.min(cross.entropy(project, K = 3))
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
library(reshape2)
# Extract ancestry coefficients
Q_values <- as.data.frame(Q(project, K = 3, run = best))
# Create a named vector to map countries to regions
# Add individual IDs and pops ids
Q_values$ind <- inds
Q_values$pop <- pops
# Melt the data frame for plotting
Q_melted <- melt(Q_values, id.vars = c("ind", "pop"))
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
Q_joined <- Q_joined |>
mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country
Q_ordered <- Q_joined |>
arrange(Region, Country) |>
mutate(Region_Country = factor(Region_Country, levels = unique(Region_Country)))
# Group by Country and calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(Region_Country, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <- data.frame(Region_Country = unique(Q_grouped$Region_Country))
# Add the order of each country to ensure correct placement of borders
borders$order <- 1:nrow(borders) + 0.5 # Shift borders to the right edge of the bars
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(Region_Country) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
# source the plotting function
source(
here("analyses", "my_theme2.R"
)
)
# Generate all potential variable names
all_variables <- paste0("V", 1:10)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge this data frame with Q_grouped_filtered to create the new color column
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create bar chart
ggplot(Q_grouped_filtered, aes(x = Region_Country, y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_segment(data = borders, aes(x = order, xend = order, y = 0, yend = 1, fill = NULL), linetype = "solid", color = "#2C444A") + # Add borders
my_theme() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none") + # Hide legend
xlab("") + # Suppress x-axis label
ylab("Ancestry proportions") +
ggtitle("Ancestry matrix") +
labs(caption = "Each bar represents the average ancestry proportions for individuals in a given country for k=4.") +
# scale_fill_manual(values = color) +
scale_x_discrete(labels = function(x) gsub(".*_", "", x)) + # Remove Region prefix from labels
scale_fill_manual(values = color_palette)
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/overlap/LEA_admixture_by_country_overlap_Set3_k4.pdf"
),
width = 10,
height = 7,
units = "in"
)
####6.2.4.1 Extract ancestry coefficients for k=3
## [1] 3
change to correct matrix
leak3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.snmf/K3/run3/r2_0.01_b_overlap_r3.3.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
head(leak3)
## # A tibble: 6 × 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 0.107 0.123 0.770
## 2 0.108 0.0153 0.877
## 3 0.0485 0.0388 0.913
## 4 0.169 0.140 0.691
## 5 0.151 0.181 0.667
## 6 0.129 0.187 0.685
The fam file
fam_file <- here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.fam")
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create column ID
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to the matrix
## ind pop X1 X2 X3
## 1 1065 SOC 0.1069680 0.1226520 0.770380
## 2 1066 SOC 0.1080590 0.0152964 0.876645
## 3 1067 SOC 0.0485005 0.0387749 0.912725
## 4 1068 SOC 0.1688380 0.1403260 0.690836
## 5 1069 SOC 0.1510640 0.1814400 0.667497
## 6 1070 SOC 0.1286890 0.1865620 0.684749
Rename the columns
# Rename the columns starting from the third one
leak3 <- leak3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
# View the first few rows
head(leak3)
## ind pop v1 v2 v3
## 1 1065 SOC 0.1069680 0.1226520 0.770380
## 2 1066 SOC 0.1080590 0.0152964 0.876645
## 3 1067 SOC 0.0485005 0.0387749 0.912725
## 4 1068 SOC 0.1688380 0.1403260 0.690836
## 5 1069 SOC 0.1510640 0.1814400 0.667497
## 6 1070 SOC 0.1286890 0.1865620 0.684749
Import Sample Locations
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
head(sampling_loc)
## # A tibble: 6 × 10
## Pop_City Country Latitude Longitude Continent Abbreviation Year Region Marker
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr> <chr>
## 1 Saint-M… France 45.2 5.77 Europe FRS 2019 Weste… SNPs
## 2 Strasbo… France 48.6 7.75 Europe STS 2019 Weste… SNPs
## 3 Penafiel Portug… 41.2 -8.33 Europe POP 2017 South… SNPs
## 4 Badajoz Spain 38.9 -6.97 Europe SPB 2018 South… SNPs
## 5 San Roq… Spain 36.2 -5.37 Europe SPS 2017 South… SNPs
## 6 Catarro… Spain 39.4 -0.396 Europe SPC 2017 South… SNPs
## # ℹ 1 more variable: order <dbl>
source(
here(
"my_theme3.R"
)
)
# Melt the data frame for plotting
Q_melted <- leak3 |>
pivot_longer(
cols = -c(ind, pop),
names_to = "variable",
values_to = "value"
)
# Join with sampling_loc to get sampling localities
Q_joined <- Q_melted |>
left_join(sampling_loc, by = c("pop" = "Abbreviation"))
# Create a combined variable for Region and Country
#Q_joined <- Q_joined |>
# mutate(Region_Country = interaction(Region, Country, sep = "_"))
# Order the combined variable by Region and Country, then by individual
Q_ordered <- Q_joined |>
arrange(order, ind) |>
mutate(ind = factor(ind, levels = unique(ind))) # Convert ind to a factor with levels in the desired order
# Add labels: country names for the first individual in each country, NA for all other individuals
Q_ordered <- Q_ordered |>
group_by(Country) |>
mutate(label = ifelse(row_number() == 1, as.character(Country), NA))
# Group by individual and variable, calculate mean ancestry proportions
Q_grouped <- Q_ordered |>
group_by(ind, variable) |>
summarise(value = mean(value), .groups = "drop")
# Create a data frame for borders
borders <-
data.frame(Country = unique(Q_ordered$Country))
# Add the order of the last individual of each country to ensure correct placement of borders
borders$order <-
sapply(borders$Country, function(rc)
max(which(Q_ordered$Country == rc))) + 0.5 # Shift borders to the right edge of the bars
# Select only the first occurrence of each country in the ordered data
label_df <- Q_ordered |>
filter(!is.na(label)) |>
distinct(label, .keep_all = TRUE)
# Create a custom label function
label_func <- function(x) {
labels <- rep("", length(x))
labels[x %in% label_df$ind] <- label_df$label
labels
}
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) + 0)
# Calculate the position of population labels and bars
pop_labels <- Q_ordered |>
mutate(Name = paste(pop, Pop_City, sep = " - ")) |>
group_by(pop) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::select(ind, Pop_City, Country, Name) |>
mutate(pos = as.numeric(ind)) # calculate position of population labels
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Calculate the position of lines
border_positions <- Q_ordered |>
group_by(Country) |>
summarise(pos = max(as.numeric(ind)) - 1)
pop_labels_bars <- pop_labels |>
mutate(pos = as.numeric(ind) - .5)
# Function to filter and normalize data
normalize_data <- function(df, min_value) {
df |>
filter(value > min_value) |>
group_by(ind) |>
mutate(value = value / sum(value))
}
# Use the function
Q_grouped_filtered <- normalize_data(Q_grouped, 0.1)
#
color_palette <-
c(
"#FF8C1A",
"yellow2",
"#1E90FF"
)
# Generate all potential variable names
all_variables <- paste0("v", 1:3)
# Create a data frame that pairs each potential variable with a color
color_mapping <- data.frame(variable = all_variables,
color = color_palette[1:length(all_variables)])
# Merge with Q_grouped_filtered
Q_grouped_filtered <- merge(Q_grouped_filtered, color_mapping, by = "variable")
# Create the plot
ggplot(Q_grouped_filtered, aes(x = as.factor(ind), y = value, fill = color)) +
geom_bar(stat = 'identity', width = 1) +
geom_vline(
data = pop_labels_bars,
aes(xintercept = pos),
color = "#2C444A",
linewidth = .2
) +
geom_text(
data = pop_labels,
aes(x = as.numeric(ind), y = 1, label = Name),
vjust = 1.5,
hjust = 0,
size = 2,
angle = 90,
inherit.aes = FALSE
) +
my_theme() +
theme(
axis.text.x = element_text(
angle = 90,
hjust = 1,
size = 12
),
legend.position = "none",
plot.margin = unit(c(3, 0.5, 0.5, 0.5), "cm")
) +
xlab("Admixture matrix") +
ylab("Ancestry proportions") +
labs(caption = "Each bar represents the ancestry proportions for an individual for k=4.\n LEA inference for 20,968 SNPs (MAF 1%) overlap.") +
scale_x_discrete(labels = label_func) +
scale_fill_manual(values = color_palette) +
expand_limits(y = c(0, 1.5))
library(raster) # important to load before tidyverse, otherwise it masks select()
library(tidyverse)
library(scatterpie)
library(sf)
library(ggspatial)
library(ggplot2)
library(dplyr)
library(colorout)
library(here)
library(extrafont)
library(rnaturalearth)
library(rnaturalearthdata)
library(rnaturalearthhires)
library(ggrepel)
library(Cairo)
genotype <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.vcf"
)
d <- read.vcfR(
genotype
)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 8
## header_line: 9
## variant count: 20968
## column count: 251
##
Meta line 8 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 20968
## Character matrix gt cols: 251
## skip: 0
## nrows: 20968
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant 13000
Processed variant 14000
Processed variant 15000
Processed variant 16000
Processed variant 17000
Processed variant 18000
Processed variant 19000
Processed variant 20000
Processed variant: 20968
## All variants processed
Get population and individuals information
inds_full <- attr(d@gt,"dimnames")[[2]]
inds_full <- inds_full[-1]
a <- strsplit(inds_full, '_')
pops <- unname(sapply(a, FUN = function(x) return(as.character(x[1]))))
table(pops)
## pops
## ALD BAR BUL CRO FRS GES GRA GRC ITB ITP ITR MAL POP ROS SER SLO SOC SPB SPC SPM
## 10 12 10 12 12 12 11 10 5 8 16 12 12 11 4 12 12 8 6 6
## SPS STS TUA TUH
## 8 12 9 12
project = load.snmfProject("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.snmfProject")
Import samples attributes
#data<-read.csv("sampling_loc_all.csv", stringsAsFactors = TRUE)
#write_rds(data, "sampling_loc_all.rds")
sampling_loc <- readRDS(here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/fst/overlap/sampling_loc_overlap.rds"))
pops <- sampling_loc |>
filter(
Continent == "Europe"
) |>
dplyr::select(
Abbreviation, Latitude, Longitude, Pop_City, Country, Region, Year, order
)
head(pops)
## # A tibble: 6 × 8
## Abbreviation Latitude Longitude Pop_City Country Region Year order
## <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 FRS 45.2 5.77 Saint-Martin-d'Her… France Weste… 2019 9
## 2 STS 48.6 7.75 Strasbourg France Weste… 2019 10
## 3 POP 41.2 -8.33 Penafiel Portug… South… 2017 11
## 4 SPB 38.9 -6.97 Badajoz Spain South… 2018 13
## 5 SPS 36.2 -5.37 San Roque Spain South… 2017 14
## 6 SPC 39.4 -0.396 Catarroja Spain South… 2017 15
Select a Q matrix from one of the runs for the best k
## [1] 3
# Extract ancestry coefficients
leak10 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.snmf/K10/run3/r2_0.01_b_overlap_r3.10.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
leak10 <- leak10 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(leak10)
## # A tibble: 6 × 10
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0519 0.0001 0.0159 0.0173 0.0642 0.0505 0.0133 0.734 0.00136 0.0519
## 2 0.0289 0.0001 0.00738 0.0286 0.0148 0.0001 0.00535 0.838 0.0001 0.0766
## 3 0.0001 0.0001 0.0001 0.0001 0.00954 0.0595 0.0001 0.897 0.0330 0.0001
## 4 0.0935 0.0217 0.0428 0.0572 0.0546 0.0297 0.0168 0.657 0.0001 0.0269
## 5 0.129 0.0174 0.0353 0.0269 0.0542 0.0547 0.0186 0.600 0.0298 0.0342
## 6 0.0574 0.0310 0.0248 0.0001 0.130 0.0348 0.0401 0.665 0.0001 0.0169
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
leak10 <- leak10 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
leak10$index <- seq_len(nrow(leak10))
# Perform the merge as before
df1 <-
merge(
leak10,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 v4 v5 v6 v7
## 170 SOC 1065 0.051923 0.000100 0.015868 0.017267 0.064246 0.050529 0.013282
## 171 SOC 1066 0.028929 0.000100 0.007377 0.028551 0.014800 0.000100 0.005353
## 172 SOC 1067 0.000100 0.000100 0.000100 0.000100 0.009545 0.059464 0.000100
## 173 SOC 1068 0.093526 0.021670 0.042771 0.057150 0.054637 0.029740 0.016780
## 174 SOC 1069 0.128679 0.017401 0.035269 0.026883 0.054183 0.054684 0.018561
## 175 SOC 1070 0.057382 0.030953 0.024805 0.000100 0.130292 0.034763 0.040107
## v8 v9 v10 Latitude Longitude Pop_City Country
## 170 0.733502 0.001363 0.051920 43.60042 39.74533 Sochi Russia
## 171 0.838124 0.000100 0.076566 43.60042 39.74533 Sochi Russia
## 172 0.897442 0.032950 0.000100 43.60042 39.74533 Sochi Russia
## 173 0.656749 0.000100 0.026876 43.60042 39.74533 Sochi Russia
## 174 0.600328 0.029819 0.034193 43.60042 39.74533 Sochi Russia
## 175 0.664566 0.000100 0.016933 43.60042 39.74533 Sochi Russia
## Region Year order
## 170 Eastern Europe 2021 46
## 171 Eastern Europe 2021 46
## 172 Eastern Europe 2021 46
## 173 Eastern Europe 2021 46
## 174 Eastern Europe 2021 46
## 175 Eastern Europe 2021 46
Q-values for k=10
make a palette with 10 colors
colors2 <-c(
"v1" = "#FF8C1A",
"v2" = "green",
"v3" = "chocolate4",
"v4" = "green4",
"v5" = "yellow2",
"v6" = "#FFB347",
"v7" = "#B22222",
"v8" = "purple",
"v9" = "blue",
"v10" = "#F49AC2"
)
colors2
## v1 v2 v3 v4 v5 v6
## "#FF8C1A" "green" "chocolate4" "green4" "yellow2" "#FFB347"
## v7 v8 v9 v10
## "#B22222" "purple" "blue" "#F49AC2"
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
#selected_countries <- world
# Filtering the world data to include only the countries in your data
selected_countries <- world |>
filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/overlap/LEA_SNPs_overlap_k10_pie.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10"), color = NA) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/overlap/LEA_SNPs_overlap_k10_pie_nolabs.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/overlap/LEA_SNPs_overlap_k10_pie_with_all_countries.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
#countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
Select a Q matrix from one of the runs for the best k
## [1] 3
# Extract ancestry coefficients
leak3 <- read_delim(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.snmf/K3/run3/r2_0.01_b_overlap_r3.3.Q"),
delim = " ", # Specify the delimiter if different from the default (comma)
col_names = FALSE,
show_col_types = FALSE
)
# Using mutate and across to round all columns to 4 decimal places
leak3 <- leak3 %>%
mutate(across(everything(), ~ round(.x, 6)))
# Viewing the first few rows to verify the result
head(leak3)
## # A tibble: 6 × 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 0.107 0.123 0.770
## 2 0.108 0.0153 0.877
## 3 0.0485 0.0388 0.913
## 4 0.169 0.140 0.691
## 5 0.151 0.181 0.667
## 6 0.129 0.187 0.685
The fam file
fam_file <- here(
"/gpfs/gibbs/pi/caccone/mkc54/albo/europe/output/overlap/r2_0.01_b_overlap.fam"
)
# Read the .fam file
fam_data <- read.table(fam_file,
header = FALSE,
col.names = c("FamilyID", "IndividualID", "PaternalID", "MaternalID", "Sex", "Phenotype"))
# View the first few rows
head(fam_data)
## FamilyID IndividualID PaternalID MaternalID Sex Phenotype
## 1 SOC 1065 0 0 0 -9
## 2 SOC 1066 0 0 0 -9
## 3 SOC 1067 0 0 0 -9
## 4 SOC 1068 0 0 0 -9
## 5 SOC 1069 0 0 0 -9
## 6 SOC 1070 0 0 0 -9
Create ID column
# Change column name
colnames(fam_data)[colnames(fam_data) == "IndividualID"] <- "ind"
# Change column name
colnames(fam_data)[colnames(fam_data) == "FamilyID"] <- "pop"
# Select ID
fam_data <- fam_data |>
dplyr::select("ind", "pop")
# View the first few rows
head(fam_data)
## ind pop
## 1 1065 SOC
## 2 1066 SOC
## 3 1067 SOC
## 4 1068 SOC
## 5 1069 SOC
## 6 1070 SOC
Add it to matrix
Rename the columns
# Rename the columns starting from the third one
leak3 <- leak3 |>
rename_with(~paste0("v", seq_along(.x)), .cols = -c(ind, pop))
Merge with pops
# Add an index column to Q_tibble
leak3$index <- seq_len(nrow(leak3))
# Perform the merge as before
df1 <-
merge(
leak3,
pops,
by.x = 2,
by.y = 1,
all.x = T,
all.y = F
) |>
na.omit()
# Order by the index column to ensure the order matches the original Q_tibble
df1 <- df1[order(df1$index),]
# Optionally, you can remove the index column if it's no longer needed
df1$index <- NULL
# Now the rows of df1 should be in the same order as the original Q_tibble
head(df1)
## pop ind v1 v2 v3 Latitude Longitude Pop_City Country
## 170 SOC 1065 0.106968 0.122652 0.770380 43.60042 39.74533 Sochi Russia
## 171 SOC 1066 0.108059 0.015296 0.876645 43.60042 39.74533 Sochi Russia
## 172 SOC 1067 0.048500 0.038775 0.912725 43.60042 39.74533 Sochi Russia
## 173 SOC 1068 0.168838 0.140326 0.690836 43.60042 39.74533 Sochi Russia
## 174 SOC 1069 0.151064 0.181440 0.667497 43.60042 39.74533 Sochi Russia
## 175 SOC 1070 0.128689 0.186562 0.684749 43.60042 39.74533 Sochi Russia
## Region Year order
## 170 Eastern Europe 2021 46
## 171 Eastern Europe 2021 46
## 172 Eastern Europe 2021 46
## 173 Eastern Europe 2021 46
## 174 Eastern Europe 2021 46
## 175 Eastern Europe 2021 46
Q-values for k=3
make a palette with 3 colors
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3"), color = NA) +
geom_text_repel(data = df_mean,
aes(x = Longitude, y = Latitude, label = pop),
size = 3,
box.padding = unit(0.5, "lines"),
max.overlaps = 50) +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()
# #
ggsave(
here("/gpfs/gibbs/pi/caccone/mkc54/albo/scripts/RMarkdowns/output/europe/lea/overlap/LEA_SNPs_overlap_k3_pie_allcountries.pdf"),
width = 12,
height = 6,
units = "in",
device = cairo_pdf
)
world <- ne_countries(scale = "medium", returnclass = "sf")
countries_with_data <- unique(df1$Country)
selected_countries <- world
# Filtering the world data to include only the countries in your data
#selected_countries <- world |>
# filter(admin %in% countries_with_data)
# Calculate mean proportions for each population
df_mean <- df1 |>
group_by(pop) |>
summarise(across(starts_with("v"), \(x) mean(x, na.rm = TRUE)),
Longitude = mean(Longitude),
Latitude = mean(Latitude))
source(
here(
"analyses", "my_theme2.R"
)
)
ggplot() +
geom_sf(data = selected_countries, fill="white") +
geom_scatterpie(data = df_mean,
aes(x = Longitude, y = Latitude, r = 1.5),
cols = c("v1", "v2", "v3"), color = "black") +
scale_fill_manual(values = colors2) +
guides(fill = "none") + # Hide legend
# coord_sf() +
coord_sf(xlim = c(-11, 48), ylim = c(33, 52)) +
my_theme()