1 SESSION INFORMATION
2 PACKAGE, DATA AND FUNCTIONS
3 DATA IMPORTATION
4 DATA CLEANING AND FORMATTING
5 FORMATAGE DES DONNEES GENOTYPES POUR CERVUS
- 5.1 Formatage
- 5.2 Save
6 IDENTIFICATION DES INDIVIDUS A TESTER AVEC CERVUS
7 RAW PARENTAGE COMPARISON
- 7.1 Mother test
- 7.2 Father test
8 ANALYSE CERVUS
- 8.1 Genotype file
- 8.2 Offspring file

Purpose: Analyse des profils génétiques 2016 à 2026
Author: Pierre-Alexandre QUITTET
Contact: pierre-alexandre.quittet@cefe.cnrs.fr
Code created: 25 septembre 2025
Last updated: 25 mars 2026

Dans l’ensemble du document, les codes sont masqués par défaut. Vous pouvez y accéder en cliquant sur le bouton Show situé sur la droite du document.

1 SESSION INFORMATION

Ci-dessous les informations issues de sessionInfo() permettant la reproducibilité du code.

# sessionInfo()

## R version 4.5.0 (2025-04-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=French_France.utf8  LC_CTYPE=French_France.utf8   
## [3] LC_MONETARY=French_France.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=French_France.utf8    
## 
## time zone: Europe/Paris
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.37     R6_2.6.1          fastmap_1.2.0     xfun_0.52        
##  [5] cachem_1.1.0      knitr_1.50        htmltools_0.5.8.1 rmarkdown_2.29   
##  [9] lifecycle_1.0.4   cli_3.6.5         sass_0.4.10       jquerylib_0.1.4  
## [13] compiler_4.5.0    rstudioapi_0.17.1 tools_4.5.0       evaluate_1.0.4   
## [17] bslib_0.9.0       yaml_2.3.10       rlang_1.1.6       jsonlite_2.0.0

2 PACKAGE, DATA AND FUNCTIONS

# PACKAGES
library(RODBC)  # connection to DBMS
library(tidyverse)
library(xlsx)

# ROOT DIRECTORY
dir = "C:/Users/quittet/Documents/marmot-quantitative-genetics"

# IMPORT DATA
load(paste0(dir, "/1_DATA_ANALYSIS/3_Data/mat_info_id.rds"))  # information de capture (use for candidate fathers)
load(paste0(dir, "/1_DATA_ANALYSIS/3_Data/pedigree_data.rds"))  # pédigrée social
load(paste0(dir, "/1_DATA_ANALYSIS/3_Data/pheno.rds"))  # données phéno

# REMOVE DUMMY PARENTS FROM PEDIGREE
pedigree_data[grepl("dummy", pedigree_data$mother), "mother"] <- NA
pedigree_data[grepl("dummy", pedigree_data$father), "father"] <- NA


# LOAD FUNCTIONS
dir <- "C:/Users/quittet/Documents/marmot-quantitative-genetics/PEDIGREE_GENETIQUE/CONSTRUCTION_PROFIL_GENETIQUE"
source(paste0(dir, "/CERVUS_functions.R"))

3 DATA IMPORTATION

On importe les données des différentes tables de la base de données marmotte (http://umr5558-marmottes.univ-lyon1.fr/phpPgAdmin-master/) et les profils lus sur Genemapper. A noter que cette étape nécessite de configurer un accès dsn à la base de données pour pouvoir faire le lien entre la BDD et R via le package RODBC.

channel <- RODBC::odbcConnect(dsn = "PostgreSQL30")  # DSN name given during configuration for BDD access

3.1 Table “parentee_genetique”

# DATA FROM BDD
parentee_genetique <- sqlQuery(channel = channel,
                               query =  paste0("
SELECT
    parentee_genetique.individu_id AS individual_id,
    parentee_genetique.mere_id AS mother,
    parentee_genetique.pere_id AS father,
    parentee_genetique.pere_lod AS father_lod,
    parentee_genetique.pere_delta AS father_delta,
    date_part('year', individu.date_emergence) AS annee_emergence,
    individu.sexe,
    capture.territoire_id
FROM
    parentee_genetique
    LEFT JOIN individu ON parentee_genetique.individu_id = individu.id
    LEFT JOIN capture ON parentee_genetique.individu_id = capture.individu_id
    LEFT JOIN territoire ON territoire.id = capture.territoire_id
    LEFT JOIN zone_etude ON territoire.zone_etude_id = zone_etude.id

WHERE
    zone_etude.nom::text = 'sassiere'::text
                  "))

# FORMATAGE
parentee_genetique <- parentee_genetique[!duplicated(parentee_genetique$individual_id), ]
parentee_genetique <- parentee_genetique[colnames(parentee_genetique) != "territoire_id"]
parentee_genetique <- parentee_genetique |>
mutate(father = if_else(father == "unknow", NA_character_, father))  # replace wrong levels of "father

# SAVE
save(parentee_genetique, file = paste0(dir, "/parentee_genetique.rds"))

# LOAD
load(paste0(dir, "/parentee_genetique.rds"))

3.2 Table “analyse_microsat”

# DATA FROM BDD
microsatellite_data <- sqlQuery(channel = channel,
                             query =  paste0("
SELECT
    analyse_microsat.id AS id_file,
    analyse_microsat.file_genet AS fsa,
    analyse_microsat.individu_id AS individual_id,
    analyse_microsat.marqueur AS marker,
    analyse_microsat.allele_1,
    analyse_microsat.allele_2,
    analyse_microsat.longueur_1,
    analyse_microsat.longueur_2,
    analyse_microsat.hauteur_1,
    analyse_microsat.hauteur_2,
    date_part('year', individu.date_emergence) AS annee_emergence,
    individu.sexe,
    capture.territoire_id
FROM
    analyse_microsat
    LEFT JOIN individu ON analyse_microsat.individu_id = individu.id
    LEFT JOIN capture ON analyse_microsat.individu_id = capture.individu_id
    LEFT JOIN territoire ON territoire.id = capture.territoire_id
    LEFT JOIN zone_etude ON territoire.zone_etude_id = zone_etude.id

WHERE
    zone_etude.nom::text = 'sassiere'::text
                  "))

# FORMATAGE
microsatellite_data <- microsatellite_data[!duplicated(microsatellite_data$id_file), ]  # remove useless doublons
microsatellite_data <- microsatellite_data[colnames(microsatellite_data) != "territoire_id"]

# standardized name of marker (e.g. ms46 = MS46)
microsatellite_data$marker <- toupper(microsatellite_data$marker)


# GET KNOWN ANALYSIS ASSOCIATED WITH FSA
#' on ajoute seulement 2013 / 2015 car les autres données sont en fouilli total... L'idée est de voir si certains mismatchs futurs sont présents dans ces analyses
#' 2013
lst_data_fsa_2013 <- c(list.files(path = paste0(dir, "/FSA/2013/Mix1"), full.names = FALSE),
                  list.files(path = paste0(dir, "/FSA/2013/Mix2"), full.names = FALSE))
microsatellite_data$year_analysis <- NA
microsatellite_data[microsatellite_data$fsa %in% lst_data_fsa_2013, "year_analysis"] <- "2013"

#' 2015
lst_data_fsa_2015 <- c(list.files(path = paste0(dir, "/FSA/2015/Mix1"), full.names = FALSE),
                  list.files(path = paste0(dir, "/FSA/2015/Mix2"), full.names = FALSE))
microsatellite_data[microsatellite_data$fsa %in% lst_data_fsa_2015, "year_analysis"] <- "2015"

# SAVE
save(microsatellite_data, file = paste0(dir, "/microsatellite_data.rds"))

# LOAD
load(paste0(dir, "/microsatellite_data.rds"))

Vérification : On vérifie si on a bien 17 marqueurs

length(unique(microsatellite_data$marker))

## [1] 17

3.3 Profils génétiques lus sur Genemapper

J’importe ici les lectures de profils génétiques .txt (tab) lus avec Genemapper. Ces fichiers .txt sont directement importés depuis Genemapper. Attention à bien ajouter à ces données brutes les mêmes colonnes que moi pour que le script fonctionne (à définir dans Genemapper lors de l’exportation) : “Sample.File” / “Sample.Name” / “Marker” / “Allele.1” / “Allele.2” / “Allele.3” / “Size.1” / “Size.2” / “Size.3” / “Height.1” / “Height.2”.

# AJOUT DES PROFILS DE 2013 / 2015 POUR VERIFICATION
pg_2013 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2013.txt"),
  header = T,
  sep = "\t",
  strip.white = T
)[1:11]

pg_2015 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2015.txt"),
  header = T,
  sep = "\t",
  strip.white = T
)[1:11]

# AJOUT DES PROFILS LUS SUR GENEMAPPER (2016 to 2022)
# read .txt
pg_2017 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2017.txt"),
  header = T,
  sep = ",",
  strip.white = T
)[1:11]  # pg = profil genetique
pg_2020 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2020.txt"),
  header = T,
  sep = ",",
  strip.white = T
)[1:11]
pg_2022 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2022.txt"),
  header = T,
  sep = ",",
  strip.white = T
)[1:11]
pg_2023_PK1_PK2 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2023_PK1_PK2.txt"),
  header = T,
  sep = ",",
  strip.white = T
)[1:11]
pg_2023_PK3_PK4 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2023_PK3_PK4.txt"),
  header = T,
  sep = ",",
  strip.white = T
)[1:11]
pg_2023_PK5 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2023_PK5.txt"),
  header = T,
  sep = ",",
  strip.white = T
)[1:11] 

pg_2026_1 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2026_1.txt"),
  header = T,
  sep = "\t",
  strip.white = T
)[1:11] 

pg_2026_2 <- read.table(
  file = paste0(dir, "/LECTURE_PROFIL/txt/LECTURE_PROFIL_2026_2.txt"),
  header = T,
  sep = "\t",
  strip.white = T
)[1:11] 

# add year of analysis for future stats
pg_2013$year_analysis <- "2013_2"
pg_2015$year_analysis <- "2015_2"
pg_2017$year_analysis <- "2017"
pg_2020$year_analysis <- "2020"
pg_2022$year_analysis <- "2022"
pg_2023_PK1_PK2$year_analysis <- "2023_1"
pg_2023_PK3_PK4$year_analysis <- "2023_2"
pg_2023_PK5$year_analysis <- "2023_3"
pg_2026_1$year_analysis <- "2026"
pg_2026_2$year_analysis <- "2026"

# merge datasets
pg <- rbind(pg_2017,
            pg_2020,
            pg_2022,
            pg_2023_PK1_PK2,
            pg_2023_PK3_PK4,
            pg_2023_PK5,
            pg_2026_1,
            pg_2026_2)

# reformat SampleName columns
pg$Sample.Name <- gsub(pattern = "1_|2_|_2", x = pg$Sample.Name, replacement = "")

# remove white space and replace by NA
pg[] <- lapply(pg, function(x) {
  if (is.character(x)) gsub(" ", "", x) else x
})

pg[pg == ""] <- NA

# rename colnames
colnames(pg) <- c(
  "fsa",
  "individual_id",
  "marker",
  "allele_1",
  "allele_2",
  "allele_3",
  "longueur_1",
  "longueur_2",
  "longueur_3",
  "hauteur_1",
  "hauteur_2",
  "year_analysis"
)

# standardized name of marker (e.g. ms46 = MS46)
pg$marker <- toupper(pg$marker)

# ON SUPPRIME LE MARQUEUR ST10 DE 2023 PK5 car a foiré...
pg[pg$marker == "ST10" & pg$year_analysis == "2023_3", c("allele_1", "allele_2")] <- NA

# QUICK VIEW OF DATA
head(pg)

##                  fsa individual_id marker allele_1 allele_2 allele_3 longueur_1
## 1 1_1360_E07_028.fsa          1360  MA091      173      175     <NA>     173.62
## 2 1_1360_E07_028.fsa          1360  BIBL1       97      107     <NA>      98.87
## 3 1_1360_E07_028.fsa          1360    MS6      158     <NA>     <NA>     158.32
## 4 1_1360_E07_028.fsa          1360 BIBL20      208      216     <NA>     206.82
## 5 1_1360_E07_028.fsa          1360   MS56      106      108     <NA>     107.40
## 6 1_1360_E07_028.fsa          1360   MS47      184      186     <NA>     184.84
##   longueur_2 longueur_3 hauteur_1 hauteur_2 year_analysis
## 1     175.60         NA     10140      6435          2017
## 2     108.16         NA      8785      5704          2017
## 3         NA         NA     24372        NA          2017
## 4     214.53         NA      7420      4567          2017
## 5     109.22         NA     14562     11281          2017
## 6     186.80         NA      7050      4379          2017

Vérification : On vérifie que l’on a bien 17 marqueurs :

length(unique(pg$marker))

## [1] 17

Vérification : Nombre de lecture par analyse

## 
##   2017   2020   2022 2023_1 2023_2 2023_3   2026 
##   1632   1479   1632   1581   1632    816   4896

Vérification : On calcule le nombre d’individu génotypé entre 2016 et 2025 :

length(unique(pg$individual_id))

## [1] 742

D. Allainé avait importé dans la BDD les lectures automatiques GENOSCREEN. Elles sont pleines de fautes car elles n’ont pas été lues directement et beaucoup de marqueurs ne sont pas correctement calibrés. Je supprime ces données pour les remplacer par celle que j’ai vraiment lues.

microsatellite_data <- microsatellite_data[!microsatellite_data$individual_id %in% pg$individual_id, ]

3.4 Table “individu”

Les informations de cette table permettent d’avoir le sexe, l’année d’émergence et la plus ancienne capture de tous les individus.

individu <- sqlQuery(
  channel = channel,
  query = paste0(
    "
WITH captures_uniques AS (
    SELECT
        c.*,
        ROW_NUMBER() OVER (
            PARTITION BY c.individu_id
            ORDER BY c.date_capture ASC
        ) AS rn
    FROM capture c
)
SELECT
    i.id AS individual_id,
    date_part('year', i.date_emergence) AS annee_emergence,
    i.sexe,
    cu.territoire_id,
    date_part('year', cu.date_capture) AS annee_capture
FROM individu i
LEFT JOIN captures_uniques cu
       ON i.id = cu.individu_id AND cu.rn = 1
LEFT JOIN territoire t ON t.id = cu.territoire_id
LEFT JOIN zone_etude z ON z.id = t.zone_etude_id
WHERE z.nom::text = 'sassiere'::text
    "
  ))

individu <- individu[!duplicated(individu$individual_id),]  # remove duplicated row
individu <- individu[colnames(individu) != "territoire_id"]

# SAVE
save(individu, file = paste0(dir, "/individu_microsat.rds"))

# LOAD
load(paste0(dir, "/individu_microsat.rds"))

Vérification : pas de NA dans les sexes des individus

Notre jeu de données contient 1 individu(s) avec sexe == NA…

head(pg[index_temp,])  # due à individual_id : Tnegatif... je supprime ces lignes

##                     fsa individual_id marker allele_1 allele_2 allele_3
## 170  1_1718_E03_012.fsa          1718  BIBL1       97     <NA>     <NA>
## 912  2_1715_P01_001.fsa          1715 BIBL18      147     <NA>     <NA>
## 1654 1_A03_1863_031.fsa          1863   MS47      180     <NA>     <NA>
## 2396 2_A08_1901_064.fsa          1901 BIBL31      157      161     <NA>
## 3138 1_A04_1802_032.fsa          1802    MS6      158     <NA>     <NA>
## 3880 2_A01_1943_015.fsa          1943  MA001     <NA>     <NA>     <NA>
##      longueur_1 longueur_2 longueur_3 hauteur_1 hauteur_2 year_analysis sexe
## 170       98.74         NA         NA      9121        NA          2017    f
## 912      147.89         NA         NA      4567        NA          2017    m
## 1654     180.86         NA         NA      1285        NA          2020    f
## 2396     157.83     161.82         NA     30361     30399          2020    f
## 3138     157.96         NA         NA      9452        NA          2022    m
## 3880         NA         NA         NA        NA        NA          2022    f
##      annee_emergence
## 170             2016
## 912             2016
## 1654            2018
## 2396            2018
## 3138            2017
## 3880              NA

pg <- pg[pg$individual_id != "Tnegatif",]

Ces observations sont issus de l’individu Tnegatif qui doit être un teste de méthode, on supprime cette observation.

Vérification : le nombre d’individus avec des années d’émergence NA

Parmi les individus nouvellement génotypés, 29 n’ont pas de date d’émergence, probablement des individus capturés adultes. Les ID de ces individus sont :

##  [1] 1417 1606 1625 1710 1730 1790 1791 1792 1842 1843 1844 1845 1846 1847 1850
## [16] 1913 1943 1989 2078 2079 2080 2166 2167 2168 2169 2170 2171 2177 2205

3.5 Table “statut”

J’importe cette table pour vérifier le statut reproducteur des individus en cas de doute.

statut_individu <- sqlQuery(
  channel = channel,
  query = paste0(
"SELECT 
    capture.individu_id,
    statut.age_obs,
    statut.statut_reproducteur,
    individu.sexe,
    date_part('year', capture.date_capture) AS annee_capture
    
FROM statut
LEFT JOIN capture ON capture.id = statut.capture_id
LEFT JOIN individu ON individu.id = capture.individu_id
ORDER BY statut.capture_id;"
  ))

# SAVE
save(statut_individu, file = paste0(dir, "/statut_individu.rds"))

# LOAD
load(paste0(dir, "/statut_individu.rds"))

4 DATA CLEANING AND FORMATTING

Nombre d’individu génotypé dans la BDD : 1442

4.1 Formattage

On va :

homogénéiser le codage des homozygotes
identifier et supprimer les allèles et marqueurs non présents dans la référence de Aurélie Cohas (2008)
cas de re-test de certains individus, déterminer quel profil sera conservé
calculer le nombre d’individus n’ayant pas été génotypés.

4.1.1 RECODE DES HOMOZYGOTES

Cette étape est nécessaire car certains haplotypes sont notés 155/0 ou 155/155 ou 155/NA. On homogénise le tout.

# Fonction d'homogénéisation des homozygotes
recoding_homozygotis <- function(x){
  if(!is.na(x[1]) && x[1] != 0 && (is.na(x[2]) || (!is.na(x[2]) && x[2] == 0))){
    x <- c(x[1], x[1])
  } else if (!is.na(x[2]) && x[2] != 0 && (is.na(x[1]) || (!is.na(x[1]) && x[1] == 0))){
    x <- c(x[2], x[2])
  } else if (is.na(x[1]) && (!is.na(x[2]) && x[2] == 0)){
    x <- c(NA, NA)
  } else {
    x <- c(x[1], x[2])
  }
  return(x)
}

# application de la fonction sur les données de la BDD
microsatellite_data[c("allele_1", "allele_2")] <- t(apply(microsatellite_data[c("allele_1", "allele_2")],
                                                          1,
                                                          recoding_homozygotis, simplify = T))

# ... sur les données lues sur Genemapper
pg[c("allele_1", "allele_2")] <- t(apply(pg[c("allele_1", "allele_2")],
                                                          1,
                                                          recoding_homozygotis, simplify = T))

4.1.2 FORMATAGE DU NOM

On homogénise le noms des allèles en enlevant les lettres e.g. 180a 180b deviennent 180 et les “?” des allèles incertains.

pg$allele_1 <- gsub(pattern = "a|b|c|\\?", x = pg$allele_1, replacement = "")
pg$allele_2 <- gsub(pattern = "a|b|c|\\?", x = pg$allele_2, replacement = "")

4.1.3 ALLELES DE REFERENCE

On ne conserve que les allèles présents dans l’appendix de Cohas et al. (2008). On précise NA pour les autres.

# ALLELES DE REFERENCES
cohas_alleles <- list(
  "BIBL1" = c("95", "97", "101", "103", "107", "109", "0"),
  "BIBL18" = c("132", "137", "143", "145", "147", "149", "0"),
  "BIBL20" = c("206", "208", "216", "218", "220", "222", "0"),
  "BIBL31" = c("157", "159", "161", "163", "0"),
  "BIBL4" = c("175", "178", "188", "190", "192", "0"),
  "MA002" = c("265", "271", "279", "281", "283", "0"),
  "MA018" = c("296", "298", "0"),
  "MA066" = c("231", "233", "241", "0"),
  "MA091" = c("159", "167", "169", "171", "173", "175", "177", "179", "188", "0"),
  "MS41" = c("184", "186", "0"),
  "MS45" = c("107", "109", "111", "0"),
  "MS47" = c("176", "180", "182", "184", "186", "188", "190", "0"),
  "MS53" = c("132", "140", "142", "144", "148", "0"),
  "MS56" = c("104", "106", "108", "110", "0"),
  "MS6" = c("142", "158", "160", "0"),
  "ST10" = c("116", "118", "120", "130", "132", "134", "136", "0")
)

On fait le bilan des allèles problématiques (i.e. non-présent dans la référence) par marqueur :

## 
##  ____________________
## MARKER : BIBL1
##  ____________________  
##  * Number problematic loci : 0 
##  * Problematic alleles :   
## 
##  ____________________
## MARKER : BIBL18
##  ____________________  
##  * Number problematic loci : 1 
##  * Problematic alleles :  133 
## 
##  ____________________
## MARKER : BIBL20
##  ____________________  
##  * Number problematic loci : 0 
##  * Problematic alleles :   
## 
##  ____________________
## MARKER : BIBL31
##  ____________________  
##  * Number problematic loci : 0 
##  * Problematic alleles :   
## 
##  ____________________
## MARKER : BIBL4
##  ____________________  
##  * Number problematic loci : 3 
##  * Problematic alleles :  179 186 
## 
##  ____________________
## MARKER : MA002
##  ____________________  
##  * Number problematic loci : 7 
##  * Problematic alleles :  277 267 275 
## 
##  ____________________
## MARKER : MA018
##  ____________________  
##  * Number problematic loci : 0 
##  * Problematic alleles :   
## 
##  ____________________
## MARKER : MA066
##  ____________________  
##  * Number problematic loci : 3 
##  * Problematic alleles :  235 239 
## 
##  ____________________
## MARKER : MA091
##  ____________________  
##  * Number problematic loci : 0 
##  * Problematic alleles :   
## 
##  ____________________
## MARKER : MS41
##  ____________________  
##  * Number problematic loci : 0 
##  * Problematic alleles :   
## 
##  ____________________
## MARKER : MS45
##  ____________________  
##  * Number problematic loci : 0 
##  * Problematic alleles :   
## 
##  ____________________
## MARKER : MS47
##  ____________________  
##  * Number problematic loci : 5 
##  * Problematic alleles :  178 174 
## 
##  ____________________
## MARKER : MS53
##  ____________________  
##  * Number problematic loci : 3 
##  * Problematic alleles :  138 
## 
##  ____________________
## MARKER : MS56
##  ____________________  
##  * Number problematic loci : 0 
##  * Problematic alleles :   
## 
##  ____________________
## MARKER : MS6
##  ____________________  
##  * Number problematic loci : 8 
##  * Problematic alleles :  156 168 
## 
##  ____________________
## MARKER : ST10
##  ____________________  
##  * Number problematic loci : 2 
##  * Problematic alleles :  128 122

Vérification : Nombre d’observations (i.e. donnée pour un marqueur donné) supprimées

Nombre d’observation(s) supprimée(s) : 32

On regarde les informations relatives à ces observations supprimées :

##    id_file                fsa individual_id marker year_analysis
## 1     3920               <NA>          1009 BIBL18          <NA>
## 2    32943   1327_A15_064.fsa          1327   ST10          2013
## 3    33129   1339_E17_075.fsa          1339  MA002          2013
## 4    34071 1-1393_O06_017.fsa          1393  MA002          <NA>
## 5    35545  S1481_M19_068.fsa          1481   MS53          <NA>
## 6    34228  S1403_C01_013.fsa          1403  BIBL4          <NA>
## 7    34579  S1425_M05_019.fsa          1425   ST10          <NA>
## 8    34775  S1436_C09_045.fsa          1436  MA002          <NA>
## 9    34639  S1428_A24_096.fsa          1428  MA002          <NA>
## 10   35773  S1496_K23_086.fsa          1496  MA002          <NA>
## 11   34292  S1408_K01_005.fsa          1408    MS6          <NA>
## 12   37477 1-1596_M07_020.fsa          1596  MA002          <NA>
## 13   37116   1575_J19_072.fsa          1575   MS53          <NA>
## 14   36111   1516_D05_029.fsa          1516   MS47          <NA>
## 15   36519   1540_D11_046.fsa          1540   MS47          <NA>
## 16   36366   1531_B09_047.fsa          1531   MS47          <NA>
## 17   36483   1538_P09_033.fsa          1538    MS6          <NA>
## 18   41369 1_1623_K01_005.fsa          1623   MS47          <NA>
## 19   41002 2_1624_N01_003.fsa          1624  MA066          <NA>
## 20   41462 1_1624_M01_003.fsa          1624   MS53          <NA>
## 21   41658 1_1637_G05_025.fsa          1637    MS6          <NA>
## 22   41380 1_1635_A05_031.fsa          1635   MS47          <NA>
## 23   41024 2_1647_L07_022.fsa          1647  MA066          <NA>
## 24   41672 1_1651_C09_045.fsa          1651    MS6          <NA>
## 25   40661 1_1652_E09_043.fsa          1652  BIBL4          <NA>
## 26   41684 1_1663_K11_038.fsa          1663    MS6          <NA>
## 27   40675 1_1666_A13_063.fsa          1666  BIBL4          <NA>
## 28   40872 2_1679_N15_052.fsa          1679  MA002          <NA>
## 29   41704 1_1683_E17_075.fsa          1683    MS6          <NA>
## 30   41066 2_1689_B19_080.fsa          1689  MA066          <NA>
## 31   41711 1_1690_C19_078.fsa          1690    MS6          <NA>
## 32   41712 1_1691_E19_076.fsa          1691    MS6          <NA>

Les FSA précisés ne sont disponibles nul part. Il s’agit probablement de test d’ajustement. De plus, les individual ID correspondent environ à 2015, donc probablement lors de la passation de Aurélie à Coraline ?

4.1.4 SUPPRESSION MARQUEUR MA001

Le marqueur n’est pas présent dans les analyses de parenté des papiers de Aurélie. On le supprime de la liste.

# REMOVE MA0001 NON USED IN PARENTAGE ANALYSIS
microsatellite_data <- microsatellite_data[!microsatellite_data$marker == "MA001",]
pg <- pg[!pg$marker == "MA001",]

4.1.5 VERIFICATIONS FINALES

Vérification : Tous les allèles présents dans la BDD sont bien présents dans les allèles de références d’Aurélie Cohas (2008)

Nombre d’allèle(s) non-présent(s) : 0

Vérification : Tous les allèles des données lues sur Genemapper sont bien présents dans les allèles de références d’Aurélie Cohas (2008)

Nombre d’allèle(s) non-présent(s) : 0

Vérification : Tous les allèles présents de la référence de A. Cohas sont dans mes allèles extraits de mes analyses GM

## BIBL182 BIBL206 
##   "137"   "222"

On a les allèles 137 et 222 qui ne sont pas présents, mais qui correspondent à des allèles qui ne sont plus utilisés car plus présent dans le bin Genemapper (i.e. contenant les allèles de références).

Vérification : Après toutes ces modifications, on recompte le nombre de marqueur

length(unique(microsatellite_data$marker))  # BDD

## [1] 16

length(unique(pg$marker))  # GM

## [1] 16

4.1.6 Hauteur de pics et correspondance de la taille des allèles

On enlève les lectures avec une hauteur de pic de moins de 300 de fluorescence. Cela correspond à 313 lectures dans mes analyses et à 250 lectures dans la BDD.

# REMOVE READS BELOW THRESHOLD
pg <- pg |>
  mutate(allele_1 = case_when(
    !is.na(allele_1) &
    !is.na(hauteur_1) &
    ((hauteur_1 < minimal_height)|
       (hauteur_2 < minimal_height)) ~ NA,
    .default = allele_1
  )
  ) |> 
    mutate(
      # propagation du NA si new_value = NA pour l’un des deux
      allele_1 = if_else(
          (is.na(allele_1) | is.na(allele_2)),
        NA,
        allele_1
      ),
      allele_2 = if_else(
          (is.na(allele_1) | is.na(allele_2)),
        NA,
        allele_2
      )
    )

# BDD
# REMOVE READ BELOW THRESHOLD
microsatellite_data <- microsatellite_data |>
  mutate(allele_1 = case_when(
    !is.na(allele_1) &
    !is.na(hauteur_1) &
    ((hauteur_1 < minimal_height)|
       (hauteur_2 < minimal_height)) ~ NA,
    .default = allele_1
  ),
  alle_2 = case_when(
    !is.na(allele_2) &
    !is.na(hauteur_2) &
    ((hauteur_1 < minimal_height)|
       (hauteur_2 < minimal_height)) ~ NA,
    .default = allele_2
  )
  ) |> 
    mutate(
      # propagation du NA si new_value = NA pour l’un des deux
      allele_1 = if_else(
          (is.na(allele_1) | is.na(allele_2)),
        NA,
        allele_1
      ),
      allele_2 = if_else(
          (is.na(allele_1) | is.na(allele_2)),
        NA,
        allele_2
      )
    )

4.1.6.1 Longueur attendue et observée

On vérifie que la catégorisation d’un allèle (fait sur Genemapper) match bien la distribution des longueurs en pB (axe \(y\)) et les valeurs de références (lignes pointillées), au décalage du séquenceur près. Cela permet d’identifier rapidement les outliers.

4.1.6.1.1 Mes lectures

J’ai déjà corrigé les erreurs “grossières” d’assignation repérées directement dans les données brutes.

microsatellite_data$allele_1 <- as.character(microsatellite_data$allele_1)
microsatellite_data$allele_2 <- as.character(microsatellite_data$allele_2)

marker_name_temp = "BIBL31"
# VISUALISE READ DISTRIBUTION PER MARKER
for(marker_name_temp in marker_name){
    df_plot <- pg |>
  filter(marker == marker_name_temp) |>
  pivot_longer(
    cols = c(allele_1, allele_2, longueur_1, longueur_2),
    names_to = c(".value", "nb"),
    names_pattern = "(.*)_(.*)"
  ) |>
  filter(!is.na(allele)) |>
  arrange(allele, longueur, year_analysis) |>
  as.data.frame()

# ADD DUMMY X AXIS
df_plot$order <- seq_len(nrow(df_plot))

# PLOT
print(
ggplot(df_plot, aes(x = order, y = longueur, col = allele, shape = year_analysis)) +
  geom_hline(aes(yintercept = as.numeric(allele), col = allele),
             linetype = 2, linewidth = 0.5, show.legend = FALSE) +
  geom_jitter(width = 50) +

  theme_classic(base_size = 15) +
  theme(axis.text.x = element_blank()) +
  labs(
    x = "",
    y = "Longueur (pb)",
    title = paste0(marker_name_temp)
  )
)
}

Tout semble bon !

4.1.6.1.2 BDD

Fonction utilisée pour corriger les erreurs

# FONCTION DE CORRECTION
#' ... NOTE : si on précise "NA" à new_value, alors les deux allèles
#' ... prendront NA comme nouvelle valeur
corriger_allele <- function(
  data = microsatellite_data,
  marker_temp,  # marqueur concerné
  allele_value,  # valeur actuelle de l'allèle
  new_value,  # valeur corrigée
  longueur_min,  # intervalle inférieure
  longueur_max  # intervalle supérieure
) {

  data %>%
    mutate(
      # correction individuelle
      allele_1 = if_else(
        marker == marker_temp &
          !is.na(allele_1) & 
          !is.na(longueur_1) &
          allele_1 %in% as.character(allele_value) &
          longueur_1 > longueur_min &
          longueur_1 < longueur_max,
        as.character(new_value),
        allele_1
      ),
      allele_2 = if_else(
        marker == marker_temp &
          allele_2 %in% as.character(allele_value) &
          !is.na(allele_2) & 
          !is.na(longueur_2) &
          longueur_2 > longueur_min &
          longueur_2 < longueur_max,
        as.character(new_value),
        allele_2
      )
    ) %>%
    mutate(
      # propagation du NA si new_value = NA pour l’un des deux
      allele_1 = if_else(
        marker == marker_temp &
          (is.na(allele_1) | is.na(allele_2)),
        NA_character_,
        allele_1
      ),
      allele_2 = if_else(
        marker == marker_temp &
          (is.na(allele_1) | is.na(allele_2)),
        NA_character_,
        allele_2
      )
    )
}

2013 et 2015
J’ai déjà corrigé ces erreurs, sans les afficher. Voir le code pour le détail…

#' A REMODIFIER AVEC LA FONCTION, CE SERA PLUS PROPRE. UNE FOIS FAIT INSERER DANS LE CHUNK CI DESSOUS
#' 
# ERREUR MS53 : 2015 : remove outliers
to_remove <- na.omit(microsatellite_data[microsatellite_data$marker == "MS53" & microsatellite_data$year_analysis == 2015 & microsatellite_data$allele_1 == 132 & microsatellite_data$longueur_1 < 132.5, ])
microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA


# ERREUR MA091 : 2015 : allèle 159 (rouge) est en réalité 175 (violet)
to_remove <- na.omit(microsatellite_data[microsatellite_data$year_analysis == 2015 & microsatellite_data$allele_1 == 159 & microsatellite_data$longueur_1 > 175, ])
microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), "allele_1"] <- 175

# ERREUR MS47 : 2013 : allèle 184 (turquoise) est en réalité 182 (vert)
to_remove <- na.omit(microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_1 == 184 & microsatellite_data$longueur_1 < 184, ])
microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), "allele_1"] <- 182

# ERREUR MS47 : 2013 : allèle 180 (doré) est en réalité 176 (rouge)
to_remove <- na.omit(microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_1 == 180 & microsatellite_data$longueur_1 < 180, ])
microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), "allele_1"] <- 176

# ERREUR MS47 : 2013 : remove outliers 182
to_remove <- na.omit(microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_1 == 182 & microsatellite_data$longueur_1 > 183, ])

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA

to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_1 == 182 & microsatellite_data$longueur_1 > 183.46 & !is.na(microsatellite_data$longueur_1) & !is.na(microsatellite_data$allele_1), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA


# ERREUR MS45 : 2015 : 2 allèles 107 précisés, mais loin des autres... Je mets NA à ce locus
to_remove <- na.omit(microsatellite_data[microsatellite_data$year_analysis == 2015 & (microsatellite_data$allele_1 == 107 & microsatellite_data$longueur_1 < 107 | microsatellite_data$allele_2 == 107 & microsatellite_data$longueur_2 < 107) , ])

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA


# ERREUR MS45 : 2013 - remove outliers 111
to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_2 == 111 & microsatellite_data$longueur_2 > 112.45 & !is.na(microsatellite_data$allele_2) & !is.na(microsatellite_data$longueur_2), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA


# ERREUR MS45 : 2013 - remove outliers 107
to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_1 == 107 & microsatellite_data$longueur_1 < 107.68 & !is.na(microsatellite_data$allele_1) & !is.na(microsatellite_data$longueur_1) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA


# ERREUR MS41 : 2013 - remove outliers 107
to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_1 == 184 & microsatellite_data$longueur_1 > 185.53 & !is.na(microsatellite_data$allele_1) & !is.na(microsatellite_data$longueur_1) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA

# ERREUR MA018 : 2015 : 1 allèle outlier tout en bas
to_remove <- na.omit(microsatellite_data[microsatellite_data$year_analysis == 2015 & (microsatellite_data$allele_1 == 296 & microsatellite_data$longueur_1 < 294 | microsatellite_data$allele_2 == 296 & microsatellite_data$longueur_2 < 294) , ])

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA



# ERREUR MA018 : 2013 - remove outliers
to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_1 == 296 & microsatellite_data$longueur_1 < 294.74 & !is.na(microsatellite_data$allele_1) & !is.na(microsatellite_data$longueur_1) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA


# ERREUR MA002 : 2013 - remove outliers 271
to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_1 == 271 & microsatellite_data$longueur_1 > 274.4 & !is.na(microsatellite_data$allele_1) & !is.na(microsatellite_data$longueur_1) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA

# ERREUR MA002 : 2013 - remove outliers 271
to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_1 == 279 & microsatellite_data$longueur_1 > 281.3 & !is.na(microsatellite_data$allele_1) & !is.na(microsatellite_data$longueur_1) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA

to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2013 & microsatellite_data$allele_2 == 279 & microsatellite_data$longueur_2 > 281.3 & !is.na(microsatellite_data$allele_2) & !is.na(microsatellite_data$longueur_2) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA

# ERREUR BIBL31 : 2015 - remove read (only two)
to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2015 & microsatellite_data$marker == "BIBL31" & !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA


# ERREUR BIBL20 : 2015 - remove outliers 208
to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2015 & microsatellite_data$allele_1 == 208 & microsatellite_data$longueur_1 < 206.42 & !is.na(microsatellite_data$allele_1) & !is.na(microsatellite_data$longueur_1) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA

to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2015 & microsatellite_data$allele_2 == 208 & microsatellite_data$longueur_2 < 206.42 & !is.na(microsatellite_data$allele_2) & !is.na(microsatellite_data$longueur_2) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA


# ERREUR BIBL20 : 2015 - remove outliers 216
to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2015 & microsatellite_data$allele_1 == 216 & microsatellite_data$longueur_1 < 214.75 & !is.na(microsatellite_data$allele_1) & !is.na(microsatellite_data$longueur_1) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA

to_remove <- microsatellite_data[microsatellite_data$year_analysis == 2015 & microsatellite_data$allele_2 == 216 & microsatellite_data$longueur_2 < 214.75 & !is.na(microsatellite_data$allele_2) & !is.na(microsatellite_data$longueur_2) &                            !is.na(microsatellite_data$year_analysis), ]

microsatellite_data[rownames(microsatellite_data) == rownames(to_remove), c("allele_1", "allele_2")] <- NA

BDD

On corrige le reste de la BDD pour lesquelles les données brutes Genemapper sont disponibles.

# BDD
# VISUALISE READ DISTRIBUTION PER MARKER
microsatellite_data$allele_1 <- as.character(microsatellite_data$allele_1)
microsatellite_data$allele_2 <- as.character(microsatellite_data$allele_2)

for(marker_name_temp in marker_name){
    df_plot <- microsatellite_data |>
  filter(marker == marker_name_temp) |>
  pivot_longer(
    cols = c(allele_1, allele_2, longueur_1, longueur_2),
    names_to = c(".value", "nb"),
    names_pattern = "(.*)_(.*)"
  ) |>
  filter(!is.na(allele)) |>
  filter(!is.na(allele) & !is.na(longueur)) |>
  arrange(allele, longueur) |> 
  as.data.frame()

# ADD DUMMY X AXIS
df_plot$order <- seq_len(nrow(df_plot))

# PLOT
print(
ggplot(df_plot, aes(x = order, y = longueur, col = allele)) +
  geom_hline(aes(yintercept = as.numeric(allele), col = allele),
             linetype = 2, linewidth = 0.5, show.legend = FALSE) +
  geom_jitter() +

  theme_classic(base_size = 15) +
  theme(axis.text.x = element_blank()) +
  labs(
    x = "",
    y = "Longueur (pb)",
    title = paste0(marker_name_temp)
  )
)
}

Là il y a pas mal d’erreurs… Je corrige.

Je ne sais pas trop quoi faire avec MA066, on remarque qu’il y a une grosse bizzarerie avec des allèles 233 en dessous des allèles 231… A voir si on les reclasses tous ou non. Je modulerai ce choix selon les erreurs avec les mères, en post-hoc.

# ST10
microsatellite_data <- corriger_allele(marker_temp = "ST10", allele_value = 118, new_value = 134, longueur_min = 134, longueur_max = 140)

microsatellite_data <- corriger_allele(marker_temp = "ST10", allele_value = 120, new_value = NA, longueur_min = 120, longueur_max = 125)

microsatellite_data <- corriger_allele(marker_temp = "ST10", allele_value = 130, new_value = 118, longueur_min = 115, longueur_max = 140)

microsatellite_data <- corriger_allele(marker_temp = "ST10", allele_value = 118, new_value = NA, longueur_min = 120, longueur_max = 140)

microsatellite_data <- corriger_allele(marker_temp = "ST10", allele_value = 136, new_value = NA, longueur_min = 137, longueur_max = 140)


# MS6
microsatellite_data <- corriger_allele(marker_temp = "MS6", allele_value = 160, new_value = 158, longueur_min = 155, longueur_max = 159)

microsatellite_data <- corriger_allele(marker_temp = "MS6", allele_value = 158, new_value = NA, longueur_min = 155, longueur_max = 157.8)

# MS56
microsatellite_data <- corriger_allele(marker_temp = "MS56", allele_value = 110, new_value = 108, longueur_min = 108, longueur_max = 110)

microsatellite_data <- corriger_allele(marker_temp = "MS56", allele_value = 108, new_value = NA, longueur_min = 109.61, longueur_max = 110)

microsatellite_data <- corriger_allele(marker_temp = "MS56", allele_value = 106, new_value = NA, longueur_min = 107.8, longueur_max = 108)


# MS53
microsatellite_data <- corriger_allele(marker_temp = "MS53", allele_value = 142, new_value = 145, longueur_min = 144, longueur_max = 145)

microsatellite_data <- corriger_allele(marker_temp = "MS53", allele_value = 140, new_value = NA, longueur_min = 136, longueur_max = 140.5)

microsatellite_data <- corriger_allele(marker_temp = "MS53", allele_value = 142, new_value = NA, longueur_min = 142, longueur_max = 143)

# MS47
microsatellite_data <- corriger_allele(marker_temp = "MS47", allele_value = 186, new_value = NA, longueur_min = 185, longueur_max = 186)

microsatellite_data <- corriger_allele(marker_temp = "MS47", allele_value = 184, new_value = NA, longueur_min = 182, longueur_max = 184)

microsatellite_data <- corriger_allele(marker_temp = "MS47", allele_value = 180, new_value = NA, longueur_min = 182, longueur_max = 184)

microsatellite_data <- corriger_allele(marker_temp = "MS47", allele_value = 180, new_value = NA, longueur_min = 175, longueur_max = 180)

microsatellite_data <- corriger_allele(marker_temp = "MS47", allele_value = 176, new_value = NA, longueur_min = 177.5, longueur_max = 180)


# MS45
microsatellite_data <- corriger_allele(marker_temp = "MS45", allele_value = 107, new_value = NA, longueur_min = 113, longueur_max = 114)

microsatellite_data <- corriger_allele(marker_temp = "MS45", allele_value = 107, new_value = 109, longueur_min = 109, longueur_max = 110)

microsatellite_data <- corriger_allele(marker_temp = "MS45", allele_value = 109, new_value = NA, longueur_min = 110.35, longueur_max = 111)

microsatellite_data <- corriger_allele(marker_temp = "MS45", allele_value = 111, new_value = NA, longueur_min = 111, longueur_max = 111.7)

# MS41
microsatellite_data <- corriger_allele(marker_temp = "MS41", allele_value = 184, new_value = NA, longueur_min = 185.6, longueur_max = 186)

# MA091
microsatellite_data <- corriger_allele(marker_temp = "MA091", allele_value = 159, new_value = 175, longueur_min = 175, longueur_max = 176)

microsatellite_data <- corriger_allele(marker_temp = "MA091", allele_value = 159, new_value = 173, longueur_min = 173, longueur_max = 175)

# MA066
microsatellite_data <- corriger_allele(marker_temp = "MA066", allele_value = 241, new_value = 231, longueur_min = 233.5, longueur_max = 235)

microsatellite_data <- corriger_allele(marker_temp = "MA066", allele_value = 241, new_value = NA, longueur_min = 241, longueur_max = 242)

microsatellite_data <- corriger_allele(marker_temp = "MA066", allele_value = 231, new_value = NA, longueur_min = 200, longueur_max = 233)

microsatellite_data <- corriger_allele(marker_temp = "MA066", allele_value = 233, new_value = NA, longueur_min = 232, longueur_max = 234)

# MA018
microsatellite_data <- corriger_allele(marker_temp = "MA018", allele_value = 298, new_value = NA, longueur_min = 298, longueur_max = 303)

microsatellite_data <- corriger_allele(marker_temp = "MA018", allele_value = 298, new_value = NA, longueur_min = 296, longueur_max = 296.3)

microsatellite_data <- corriger_allele(marker_temp = "MA018", allele_value = 296, new_value = NA, longueur_min = 294, longueur_max = 294.7)

microsatellite_data <- corriger_allele(marker_temp = "MA018", allele_value = 296, new_value = NA, longueur_min = 295.3, longueur_max = 296)


# MA002
microsatellite_data <- corriger_allele(marker_temp = "MA002", allele_value = 279, new_value = 281, longueur_min = 283, longueur_max = 283.6)

microsatellite_data <- corriger_allele(marker_temp = "MA002", allele_value = 279, new_value = 271, longueur_min = 271, longueur_max = 275)

microsatellite_data <- corriger_allele(marker_temp = "MA002", allele_value = 279, new_value = NA, longueur_min = 281.3, longueur_max = 282)

microsatellite_data <- corriger_allele(marker_temp = "MA002", allele_value = 281, new_value = NA, longueur_min = 283.05, longueur_max = 284)

microsatellite_data <- corriger_allele(marker_temp = "MA002", allele_value = 271, new_value = NA, longueur_min = 274.7, longueur_max = 275)

# BIBL4
microsatellite_data <- corriger_allele(marker_temp = "BIBL4", allele_value = 188, new_value = 175, longueur_min = 175, longueur_max = 176)

microsatellite_data <- corriger_allele(marker_temp = "BIBL4", allele_value = 190, new_value = 188, longueur_min = 188, longueur_max = 190)

microsatellite_data <- corriger_allele(marker_temp = "BIBL4", allele_value = c(188, 190, 192), new_value = NA, longueur_min = 179, longueur_max = 188)

# BIBL31
#'... Marqueur compliqué... Beaucoup de variance dans les longueurs
microsatellite_data <- corriger_allele(marker_temp = "BIBL31", allele_value = 161, new_value = 157, longueur_min = 157, longueur_max = 158)

microsatellite_data <- corriger_allele(marker_temp = "BIBL31", allele_value = 159, new_value = 161, longueur_min = 161, longueur_max = 162.3)


# BIBL20
microsatellite_data <- corriger_allele(marker_temp = "BIBL20", allele_value = 216, new_value = NA, longueur_min = 215.8, longueur_max = 216)

microsatellite_data <- corriger_allele(marker_temp = "BIBL20", allele_value = 208, new_value = NA, longueur_min = 206, longueur_max = 206.5)

# BIBL18
microsatellite_data <- corriger_allele(marker_temp = "BIBL18", allele_value = 149, new_value = 145, longueur_min = 145, longueur_max = 147)

microsatellite_data <- corriger_allele(marker_temp = "BIBL18", allele_value = 149, new_value = NA, longueur_min = 149, longueur_max = 149.85)  # Pas sûr de les enlever ceux là, mais décalage justifiable

# BIBL1
microsatellite_data <- corriger_allele(marker_temp = "BIBL1", allele_value = 101, new_value = 107, longueur_min = 107, longueur_max = 109)

microsatellite_data <- corriger_allele(marker_temp = "BIBL1", allele_value = 103, new_value = 101, longueur_min = 101, longueur_max = 103)

microsatellite_data <- corriger_allele(marker_temp = "BIBL1", allele_value = 97, new_value = 95, longueur_min = 97, longueur_max = 98)

microsatellite_data <- corriger_allele(marker_temp = "BIBL1", allele_value = 101, new_value = NA, longueur_min = 103, longueur_max = 104)

microsatellite_data <- corriger_allele(marker_temp = "BIBL1", allele_value = 97, new_value = NA, longueur_min = 99.6, longueur_max = 101)

microsatellite_data <- corriger_allele(marker_temp = "BIBL1", allele_value = c(107, 109), new_value = NA, longueur_min = 108.95, longueur_max = 109.5)

microsatellite_data <- corriger_allele(marker_temp = "BIBL1", allele_value = 101, new_value = NA, longueur_min = 101, longueur_max = 102.2)

microsatellite_data <- corriger_allele(marker_temp = "BIBL1", allele_value = 107, new_value = NA, longueur_min = 107, longueur_max = 107.79)


# sum(is.na(a$allele_1))
# sum(is.na(microsatellite_data$allele_1))

On reverifie que tout est ok ! :

4.2 Genotype re-test

Certains individus ont été génotypé plusieurs fois, et les résultats des deux analyses sont parfois différents. Pour choisir quelle analyse prendre, je vais d’abord voir, NA non-inclus, si les résultats par marqueurs test/retest sont congruents. En cas de mismatch, j’ai ensuite réalisé des filtres censés retenir l’allèle le plus vraisembable par marqueur.

4.2.1 Merge des données BDD et des données Genemapper

On merge les deux dataframes issus (1) de mes analyses (2016 à 2026) et (2) celle de la BDD \(<\) 2016 car certains individus ont pu être re-testés dans le batch que j’ai analysé.

# artificially add unique id_file to pg
pg$id_file <- paste0("000", 1:nrow(pg))

# ne garder que les colonnes en commun mais en conservant year_analysis
pg <- pg[colnames(pg) %in% colnames(microsatellite_data)]

# reorder columns
pg <- pg[colnames(microsatellite_data)[colnames(microsatellite_data) %in% colnames(pg)]]

# merging
pg_all <- rbind(pg, microsatellite_data[colnames(microsatellite_data)[colnames(microsatellite_data) %in% colnames(pg)]])

4.2.2 Individus re-testés de puis le début de suivi

index_temp_retest <- which(duplicated(paste0(pg_all$individual_id, pg_all$marker)))  # row with duplicated marker x individual id
id_temp_retest <- unique(pg_all[index_temp_retest, "individual_id"])

Vérification : Nombre d’individu retesté

Nombre d’individu re-testé : 91.

Leur ID est :

##  [1]  137  832  849  850  864  888  904 1005 1021 1061 1278 1279 1296 1297 1300
## [16] 1303 1311 1312 1316 1319 1327 1336 1345 1352 1355 1359 1361 1375 1376 1381
## [31] 1382 1390 1399 1405 1455 1470 1481 1520 1527 1551 1575 1598 1714 1715 1716
## [46] 1717 1718 1723 1731 1733 1763 1770 1778 1802 1830 1845 1853 1859 1890 1894
## [61] 1897 1899 1909 1934 1943 1945 1946 1947 1948 1949 1950 1952 1957 1960 1961
## [76] 1966 1990 1994 1999 2046 2068 2107 2108 2116 2124 2126 2138 2158 2270 2309
## [91] 2409

Vérification : Répartition des re-test par analyse Genemapper

## 
##   2013   2015   2017   2020   2022 2023_1 2023_2 2023_3   2026   <NA> 
##     44      4     42     12     42     24     22     22     64     65

4.2.3 Mismatch test-retest

4.2.3.1 Etape 1 : congruence des tests-retests

On extrait les individus pour lesquels on a un mismatch test-retest i.e. où au moins un des test-retest ne donne pas le même résultat que les autres.

mismatch_retest_id <- c()  # vector to stock individuals with different test/re-test results
list_error <- list()
mismatch_marker <- c()  # store mismatched marker
mismatch_analysis <- c()  # store analysis that Id did that failed

# FOR LOOP TO EXTRACT MISMATCHED TEST RETEST PER INDIVIDUAL
for(i in id_temp_retest){
  data_temp <- pg_all[pg_all$individual_id == i, ]
  data_temp$haplotype <- paste0(data_temp$marker, "_", data_temp$allele_1, "_", data_temp$allele_2)
  
  # check if at least one mismatch test/re-test is present
  test <- 
    data_temp |> 
    filter(!is.na(allele_1) | !is.na(allele_2)) |> 
    group_by(marker) |> 
    mutate(error = length(unique(haplotype)) != 1) |> 
    arrange(marker) |> 
    as.data.frame() |> 
    filter(error)
  
  # count longest sequence
  nb_na_seq <- 
    data_temp |> 
    group_by(fsa) |> 
    reframe(seq_non_na = n() - sum(is.na(allele_1)))  # longest profil sequence without NAs (i.e. most reliable)

  test <- 
    test |> 
    right_join(y = nb_na_seq, by = "fsa") |> 
    filter(error)
  
  # count number of duplicated test
  haplotype_count <- 
    test |> 
    filter(!is.na(allele_1)) |> 
    dplyr::count(haplotype)
 
  test <- 
    test |> 
    right_join(y = haplotype_count, by = "haplotype") |> 
    filter(error)
  
  colnames(test)[ncol(test)] <- "n_test"
  
  # output
  if(any(test$error)){
    
    # store id
    pb_id <- unique(test$individual_id)
    pb_marker <- unlist(unique(test$marker))
    pb_analysis <-  unlist(unique(test$year_analysis))
    
    mismatch_retest_id <- c(mismatch_retest_id, pb_id)
    mismatch_marker <- c(mismatch_marker, pb_marker)
    mismatch_analysis <- c(mismatch_analysis, pb_analysis)
    
    # store data_temp to manually check results
    list_error[[pb_id]] <- test
    
  }
  }

# CONCATENATE RESULTS
df_error <- do.call(rbind, list_error)
# write.xlsx(df_error, file = "test_retest_error_to_check.xlsx")

Vérification : Nombre d’individu avec mismatch test-retest

Nombre d’individu avec au moins un mismatch test-retest : 15 (16% des individus re-testés).

Leur ID est :

sort(as.numeric(mismatch_retest_id))

##  [1]  137  832 1327 1723 1763 1830 1845 1934 1947 1948 1949 1950 1952 2068 2158

Vérification : Nombre d’observation (individu*marqueur) avec mismatch test-retest

Nombre de lecture avec au moins un mismatch test-retest : 128

Vérification : Répartition des mismatchs test-retest par marqueur

## mismatch_marker
##  BIBL1 BIBL18 BIBL20 BIBL31  BIBL4  MA002  MA018  MA066  MA091   MS41   MS45 
##      4      4      2      4      6      3      3      1      4      4      3 
##   MS47   MS53   MS56    MS6   ST10 
##      4      7      4      5      5

Les mismatchs test-retests se trouvent surtout sur le marqueur ST10 (dans l’analyse 2023 PK5 [2023_3]). Mais ce marqueur a été supprimé pour cette analyse.

Vérification : Bilan erreur

Le bilan des mismatchent test/retest est disponible dans le fichier .xlsx bilan_mismatch_test_retest.xlsx.

4.2.3.2 Etape 2

L’idée est maintenant de déterminer quelle analyse test/retest conserver. On sélectionne la meilleure analyse par individu par marqueur selon les critères suivants (par ordre d’importance)

Qualité de la séquence - Allèles correspondant au profil avec la plus longue séquence sans NAs
Haplotype le plus fréquent - Si séquence sans NA égale, prendre l’haplotype le plus fréquent
Analyse la plus récente - Si toutes les fréquences sont égales, prendre la séquence la plus récente

index <- match(df_error$individual_id, pg_all$individual_id)
df_error$year_analysis <- pg_all$year_analysis[index]
df_error$year_analysis[df_error$year_analysis %in% c("2023_1", "2023_2", "2023_3")] <- 2023  # pour tirer au plus récent

to_remove <- c()

for(ind_temp in unique(df_error$individual_id)){
  marker_pb <- unique(df_error[df_error$individual_id == ind_temp, "marker"])
  
  for(marker_temp in marker_pb){
    df_filtered <- df_error[df_error$individual_id == ind_temp & df_error$marker == marker_temp, ]
    
      # a. Garder séquence(s) la/les plus longue(s)
      df_temp <- df_filtered |> 
        filter(seq_non_na == max(seq_non_na, na.rm = TRUE))
      
      # b. Si plusieurs séquences équivalentes, garder haplotype le plus fréquent
      if(nrow(df_temp) > 1){
        freq_tab <- table(df_temp$haplotype)
        max_freq <- max(freq_tab)
        best_haplos <- names(freq_tab[freq_tab == max_freq])
        
        df_temp <- df_temp |> 
          filter(haplotype %in% best_haplos)
      }
      
      # c. Si encore ex-aequo, garder la plus récente analyse
      if(nrow(df_temp) > 1 & !all(is.na(df_temp$year_analysis))){
        df_temp <- df_temp |> 
          filter(year_analysis == max(year_analysis, na.rm = TRUE))
      }
      
      # Sélection finale : garder le premier (au cas d’égalité stricte)
      df_keep <- df_temp[1, ]
      
      # Marquer tous les autres comme "à supprimer"
      ids_remove <- setdiff(df_filtered$id_file, df_keep$id_file)
      to_remove <- c(to_remove, ids_remove)
  }
}

# On supprime les lignes avec les données problématiques ou redondantes
pg_all <- pg_all[!pg_all$id_file %in% to_remove, ]

Le problème de cette méthode est que on va crée des chimères de profils génétiques issus de plusieurs génotypage (i.e. fichiers .fsa). Je me demande si on ne devrait pas prendre la séquence non-NA la plus longue uniquement. A réfléchir.

4.2.4 Match test-retest

Pour les individus sans mismatch test/retest, on supprime simplement les doublons. En cas de NA pour un marqueur donné, on conserve la donnée non-NA.

for(i in id_temp_retest){  # [!id_temp_retest %in% mismatch_retest_id]
  
  data_temp <- pg_all[pg_all$individual_id %in% i, ]
  id_file_initial <- data_temp$id_file
  
  # on optimise les données gardées i.e. si NAs dans test et pas NAs dans retest, on garde retest
  id_file_to_keep <- 
    data_temp |> 
    group_by(marker) %>%
    # Si le marker a au moins une ligne complète, supprimer les NAs
    filter(
      !(is.na(allele_1) & is.na(allele_2) & any(!is.na(allele_1) | !is.na(allele_2)))
    ) %>%
    filter(!duplicated(marker)) |>   # on enlève les doublons car on sait qu'ils sont identiques (test précédent)
    ungroup() |> 
    dplyr::select(id_file) |> 
    as.data.frame() |> 
    unlist()
  
  # remove the non wanted id file
  id_file_to_remove <- id_file_initial[!id_file_initial %in% id_file_to_keep]
  
  # filter dataframe
  pg_all <- pg_all[!pg_all$id_file %in% id_file_to_remove, ]
}

Même problème que plus haut, cette méthode est que on va crée des chimères de profils génétiques issus de plusieurs génotypage (i.e. fichier .fsa).

Vérification : Après cette méthode, on s’assure qu’on a plus d’individus avec plusieurs données pour un marqueur donné

Nombre d’individu(s) dupliqué(s) : 0

4.3 Non-genotyped individuals

On calcule le nombre d’individu n’ayant pas été génotypés. A noter qu’Aurélie COHAS m’a indiqué que les individus non-génotypés datant d’avant 2015 ne peuvent simplement pas l’être… Leur ADN est trop âbimé. Certains individus ont eu jusqu’à 10 génotypages, qui n’a jamais rien donné.

Nombre d’individu(s) non-génotypé(s) : 104 / 2287 (4.55%)

Vérification : Répartition des non-génotypés dans les années

## 
## 1990 1991 1992 1996 1997 2000 2003 2011 2012 2013 2014 2016 2017 2018 2019 2023 
##   16   11    6    8    2    1   13    1    7    6    1    1    3    1    3   15 
## 2024 2025 
##    3    6

La plupart des individus non-génotypés sont de 1990, 2023, 1991 et 2003.

Vérification : Nombre d’individus parents sociaux non-génotypés

Ces individus correspondraient aux individus prioritaires pour les analyses car ils apporteraient de l’information de parentée sur toutes leurs portées.

Nombre d’individu(s) non-génotypé(s) parent(s) : 6.

Leur ID sont :

## [1]    6   45   55 1605 2220 2280

Vérification : Répartition des parents non-génotypés selon l’année de capture

## 
## 1990 1991 1992 2014 2023 
##    1    1    1    1    2

La plupart sont des parents de 2017… et 3 d’entre eux sont de très vieux individus (1990, 1991 et 1992).

5 FORMATAGE DES DONNEES GENOTYPES POUR CERVUS

On formatte les données comment dans CERVUS pour avoir un tableau avec en ligne les individus, en colonne leur profil génétique au loci donné (i.e. 2 par allèles e.g. BIBL1a / BIBL1b) et des 0 à la place des NA. Ce tableau correspondra au fichier genotype files de l’analyse CERVUS.

5.1 Formatage

# FORMAT DF
pg_cervus <- pg_all |>
  arrange(individual_id, desc(id_file)) |>
  filter(!duplicated(paste0(marker, individual_id))) |>
  
  #formatting for cervus : ID / SEXE / marker_a / marker_b
  pivot_wider(id_cols = c(individual_id, sexe, annee_emergence),
              names_from = marker,
              values_from = c(allele_1, allele_2),
              names_sep = "",
              names_vary = "slowest"
  )

pg_cervus <- as.data.frame(pg_cervus)

# RENAME COL FOR CERVUS
former_colnames <- colnames(pg_cervus)[4:ncol(pg_cervus)] 
suffixe <- rep(x = c("a", "b"), length(former_colnames)/2)
new_names <- paste0(gsub(pattern = "allele_1|allele_2", replacement = "", x = former_colnames), suffixe)
colnames(pg_cervus)[4:ncol(pg_cervus)] <- new_names


# REPLACE NA PER 0 (cf Cervus tutorial)
pg_cervus[4:ncol(pg_cervus)] <- apply(pg_cervus[4:ncol(pg_cervus)], 2, function(x) replace_na(as.numeric(as.character(x)), 0))

# REMOVE BLANK SPACE
pg_cervus[] <- lapply(pg_cervus, function(x) {
  if (is.character(x)) gsub(" ", "", x) else x
})  # pg[] permet de garder la structure en data.frame

# TYPE OF VARIABLES
pg_cervus[4:ncol(pg_cervus)] <- sapply(X = pg_cervus[,4:ncol(pg_cervus)], as.character)
pg_cervus$individual_id <- as.character(pg_cervus$individual_id)  # IMPORTANT

head(pg_cervus)

##   individual_id sexe annee_emergence MA066a MA066b MA018a MA018b MA002a MA002b
## 1           100    m              NA    231    241    296    298    271    279
## 2          1000    f            2006    231    231    298    298    281    281
## 3          1001    f            2006    231    241    298    298    279    281
## 4          1002    f            2006    231    231    298    298    279    281
## 5          1003    m            2006    231    231    298    298    281    281
## 6          1004    f            2006    231    231    298    298    281    281
##   BIBL4a BIBL4b BIBL31a BIBL31b BIBL20a BIBL20b BIBL18a BIBL18b BIBL1a BIBL1b
## 1    175    188     157     159     208     208     137     149     97    101
## 2    190    192     157     157     208     216     143     145     95    101
## 3    188    190     157     159     208     216     143     145    101    109
## 4    188    190     157     159     208     216     145     147    101    109
## 5    188    190     157     159     208     218     143     145    101    107
## 6    188    190     157     159     208     218     145     149    101    107
##   ST10a ST10b MS56a MS56b MS6a MS6b MS53a MS53b MS47a MS47b MS45a MS45b MS41a
## 1   130   134   106   108  158  158   140   140   182   184   109   109   186
## 2   116   118   108   108  158  158   140   142   186   186   107   109   186
## 3   116   132   106   108  158  158   140   140   176   182   109   109   186
## 4   116   118   106   108  158  158   132   142   176   182   109   109   186
## 5   118   130   106   108  158  158   140   140   186   186   107   109   184
## 6   118   130   108   108  158  158   132   140   182   186   109   109   184
##   MS41b MA091a MA091b
## 1   186    167    175
## 2   186    175    175
## 3   186    159    175
## 4   186    159    175
## 5   186    175    175
## 6   184    173    175

Vérification : Présence de NA au lieu de 0 dans les allèles ?

Nombre de NA dans le tableau : 0

Vérification : Respect de l’ordre des allèles ?

Nombre de cas où allèle 1 < allèle 2 : 0

Vérification : Nombre de profils ne contenant que des 0s

Nombre d’individus sans données : 1
Cette observation vide correspond à un individu de 2026 dont l’extraction a loupé. On le supprime.

pg_cervus[apply(pg_cervus[, 4:ncol(pg_cervus)], 1, function(x){all(x == 0)}), ]

##      individual_id sexe annee_emergence MA066a MA066b MA018a MA018b MA002a
## 1316          2225    f            2023      0      0      0      0      0
##      MA002b BIBL4a BIBL4b BIBL31a BIBL31b BIBL20a BIBL20b BIBL18a BIBL18b
## 1316      0      0      0       0       0       0       0       0       0
##      BIBL1a BIBL1b ST10a ST10b MS56a MS56b MS6a MS6b MS53a MS53b MS47a MS47b
## 1316      0      0     0     0     0     0    0    0     0     0     0     0
##      MS45a MS45b MS41a MS41b MA091a MA091b
## 1316     0     0     0     0      0      0

pg_cervus <- pg_cervus[!apply(pg_cervus[, 4:ncol(pg_cervus)], 1, function(x){all(x == 0)}), ]

Vérification : Longueur des séquences non-NA

## Fréquence de la longueur de séquence non-NA
##   16   15   14   13   12   11   10    9    8    7    6    5    3    2 
## 0.54 0.15 0.08 0.11 0.06 0.02 0.01 0.01 0.01 0.00 0.00 0.00 0.00 0.00

88% des profils génétiques ont plus de 13 / 16 marqueurs non-NA.

On considerera un mauvais génotypage si l’individu a moins de 13 marqueurs non-NA.

Nombre d’individu avec moins de 13 marqueurs à comparer : 252 (12%)

Leur ID est :

##   [1]   66  240  373  541  542  553  612  649  819  849  850  860  864  906  910
##  [16]  915 1005 1099 1220 1304 1327 1345 1361 1404 1424 1432 1434 1446 1447 1452
##  [31] 1455 1470 1481 1484 1494 1503 1506 1513 1517 1527 1531 1540 1550 1551 1554
##  [46] 1564 1568 1569 1573 1575 1581 1587 1596 1620 1624 1629 1631 1632 1635 1638
##  [61] 1642 1643 1645 1649 1650 1652 1653 1654 1659 1660 1661 1664 1673 1675 1676
##  [76] 1677 1680 1686 1687 1688 1690 1691 1697 1698 1699 1700 1702 1709 1719 1727
##  [91] 1738 1747 1768 1769 1784 1796 1797 1798 1799 1800 1801 1807 1808 1811 1817
## [106] 1818 1819 1823 1829 1831 1833 1834 1841 1842 1843 1846 1848 1854 1855 1856
## [121] 1857 1862 1869 1872 1877 1878 1884 1898 1903 1907 1922 1936 1938 1941 1949
## [136] 1950 1951 1956 1967 1995 2005 2017 2068 2075 2101 2102 2103 2104 2105 2106
## [151] 2111 2112 2114 2115 2120 2125 2126 2133 2148 2156 2163 2164 2167 2182 2190
## [166] 2198 2199 2200 2202 2206 2207 2208 2209 2210 2211 2212 2224 2227 2228 2229
## [181] 2230 2232 2233 2235 2239 2240 2241 2243 2244 2245 2248 2249 2253 2255 2256
## [196] 2257 2258 2259 2260 2261 2263 2265 2266 2268 2269 2271 2273 2274 2275 2277
## [211] 2278 2281 2282 2285 2286 2289 2291 2297 2301 2302 2304 2305 2306 2320 2321
## [226] 2325 2332 2339 2341 2344 2345 2346 2347 2348 2351 2363 2367 2368 2369 2373
## [241] 2395 2399 2401 2402 2403 2406 2407 2408 2412 2418 2438 2459

Vérification : Informations sur la répartition des mauvais profils

## YEAR OF ANALYSIS
##   2013   2015   2017   2020   2022 2023_1 2023_2 2023_3   2026   <NA> 
##      4     28      8     19     11      2     21     11     94     54

## MARKER (PROPORTION OF NA)
##  BIBL1 BIBL18 BIBL20 BIBL31  BIBL4  MA002  MA018  MA066  MA091   MS41   MS45 
##   0.09   0.03   0.20   0.08   0.06   0.09   0.17   0.15   0.04   0.03   0.02 
##   MS47   MS53   MS56    MS6   ST10   <NA> 
##   0.07   0.03   0.02   0.04   0.13

5.2 Save

# # SAVE 
# dir <- "C:/Users/quittet/Documents/marmot-quantitative-genetics/PEDIGREE_GENETIQUE/CERVUS/"
# write.table(pg_cervus, file = paste0(dir, "ANALYSIS_0_BDD_CERVUS/genotype.txt"), row.names = F, sep = "\t")  # analyse 0
# write.table(pg_cervus, file = paste0(dir, "ANALYSIS_1_BDD_CERVUS/genotype.txt"), row.names = F, sep = "\t")  # analyse 1
# write.table(pg_cervus, file = paste0(dir, "ANALYSIS_2_BDD_CERVUS/genotype.txt"), row.names = F, sep = "\t")  # analyse 2

6 IDENTIFICATION DES INDIVIDUS A TESTER AVEC CERVUS

Les individus sur lesquels réalisés l’analyse de parenté sont les individus répondant aux critères suivants :

Génotypé entre 2016 et 2026
Capturé marmotton (i.e. date d’émergence connu)
Sans mère génétique précisé dans la BDD

#' extract id of all new tested individuals
id_tested_2016_2022 <- unique(pg$individual_id)

# ajout année de capture la plus ancienne
#' ... note : les dates de captures les plus anciennes ont déjà été filtré 
#' ... dans ìndividu
index_temp <- match(pg_cervus$individual_id, individu$individual_id)
pg_cervus$oldest_capture <- individu$annee_capture[index_temp]
pg_cervus <- pg_cervus[c(1,2,3,ncol(pg_cervus), 4:(ncol(pg_cervus) - 1))]  # sort columns

# EXTRAIRE LES ID DES INDIVIDUS A TESTER AVEC CERVUS
#' ... (*) qui ont été génotypé entre 2016 et 2022
#' ... (*) mère sociale connue
#' ... (*) sans parents génétiques dans la BDD
id_pups_to_test <- pg_cervus[pg_cervus$individual_id %in% id_tested_2016_2022 &  # génotypé en 2016 - 2022
                               !is.na(pg_cervus$annee_emergence) &  # date d'émergence connue
                               !pg_cervus$individual_id %in% parentee_genetique$individual_id[!is.na(parentee_genetique$mother)],  # sans mère génétiques dans la BDD
                             "individual_id"]

Nombre d’individu(s) à tester avec CERVUS : 695

Vérification : Nombre de mismatch test-retest parmi les individus à analyser

Nombre d’individu ayant au moins une donnée issu d’un retest : 12.

Leur ID est :

##  [1] 1723 1763 1830 1845 1934 1947 1948 1949 1950 1952 2068 2158

Vérification : Informations sur ces individus re-testés

##      individual_id annee_emergence sexe annee_capture genemapper_analysis
## 800           1723            2016    m          2016                2017
## 844           1763            2016    f          2016                2017
## 918           1830            2017    f          2017              2023_1
## 934           1845              NA    m          2018                2022
## 1033          1934            2019    m          2019                2020
## 1047          1947            2019    m          2019                2022
## 1048          1948            2019    f          2019                2022
## 1049          1949            2019    m          2019                2022
## 1051          1950            2019    f          2019                2022
## 1053          1952            2019    m          2019                2022
## 1182          2068            2020    f          2020              2023_1
## 1282          2158            2021    m          2021              2023_2

Les retests ont surtout eu lieu sur des marmottons en 2022 et 2023.

7 RAW PARENTAGE COMPARISON

L’idée ici est de tester si la mère a toujours au moins un de ses allèles en commun avec le marmotton. On utilise pas CERVUS ici.

7.1 Mother test

On teste ici uniquement avec la mère. On regarde si au moins une allèle de l’offspring par locus est présent chez la mère. L’idée étant qu’on connait l’identité de la mère, qui est la femelle dominante. Donc sauf cas rarissime, on ne devrait pas avoir d’erreur.

Vérification : Tous les individus à tester sont présents dans le pédigrée

Nombre d’individu à tester non-présent dans le pédigrée : 1.

Les ID sont :

## [1] "2077"

Seul l’individu 2077 n’est pas présent. Cela est du au fait que c’est un individu qui a perdu sa bague et qui est 1967 en réalité, après comparaison de son génotype (voir fichier famille). On corrige :

pg_cervus <- pg_cervus[!pg_cervus$individual_id == 1967, ]  # on conserve les data de 1967 qui ont plus de marqueur
pg_cervus$individual_id[pg_cervus$individual_id == 2077] <- "1967"  # on renomme
id_pups_to_test <- id_pups_to_test[!id_pups_to_test == "2077"]  # on supprime 2077 des pups id à tester

7.1.1 Mère sociale des individus

On filtre le pédigrée pour extraire les parents sociaux des individus à tester. On trie le pédigrée selon les critères suivants :

Mère sociale a été génotypée
Individus génotypés entre 2016 et 2022
Mère sociale connue

Vérification : Nombre d’individu avec mère non-génotypée

Nombre d’individu(s) avec mère non-génotypée : 12

Leur ID est :

##  [1] 1709 1816 1817 1818 1831 2048 2049 2050 2051 2060 2454 2455

Vérification : Nombre de mère non-génotypée

Nombre de mère(s) non-génotypée(s) : 2

Leur ID est :

## [1] "2280" "1605"

Vérification : Nombre d’individu avec mère sociale NA

Individus à tester avec mère = NA : 22.

Leur ID est :

##  [1] 1626 1882 1883 1884 1885 1890 1909 1958 1964 1965 1969 2027 2028 2029 2030
## [16] 2139 2140 2146 2147 2165 2415 2435

Vérification : Cohorte des individus capturés marmotton de mère sociale NA

## 
## 2013 2018 2019 2020 2021 2025 <NA> 
##    1    6    4    4    5    2    0

Ces bizareries ont été envoyées à Rebecca pour correction et enquête.

7.1.2 Comparaison des génotypes

On trie le pédigrée selon les critères cités plus haut :

Mère sociale a été génotypée
Individus génotypés entre 2016 et 2026
Mère sociale connue
Individu avec plus de 13 marqueurs non-NA

pedigree_data[grepl("dummy", pedigree_data$mother), "mother"] <- NA

ped_tested <- pedigree_data[!is.na(pedigree_data$mother) &
                              # mère sociale connue!pedigree_data$individual_id %in% id_non_genotyped &
                              !pedigree_data$mother %in% id_non_genotyped &
                              !is.na(pedigree_data$mother) &
                              # mère génotypée
                              pedigree_data$individual_id %in% pg_cervus$individual_id &
                              !pedigree_data$individual_id %in% parentee_genetique$individual_id, ]  # individu dont la parentée génétique n'a pas été testé

Vérification : Nombre d’individu testé effectivement

Nombre d’individu(s) répondant à tous les critères de filtre : 668

7.1.3 Test

La fonction match_genotype() permet de comparer si l’offspring et la mère partagent au moins un allèle à chaque marqueur.

res_match <- apply(X = ped_tested[1:2], MARGIN = 1, FUN = function(x) match_genotype(x)$mother_match)

Associations correctes : 589 / 668 (88%)

7.1.4 Exploration des mismatchs

A noter que l’ensemble des parentées sociales ont été re-vérifiées à partir du fichier famille quand il y avait au moins un mismatch…

# GET MISMATCH INDIVIDUAL INFORMATION
index_temp <- as.numeric(which(!res_match))
index_temp <- match(ped_tested$individual_id, individu$individual_id)  # match cohort
ped_tested["annee_emergence"] <- individu[index_temp, "annee_emergence"]
ped_tested[["match_mother"]] <- res_match
ped_tested["n_loci_compared"] <- apply(X = ped_tested[1:2], MARGIN = 1, FUN = function(x) match_genotype(x)$n) 
ped_tested["n_error"] <- apply(X = ped_tested[1:2], MARGIN = 1, FUN = function(x) match_genotype(x)$nb_mismatch_mother_offspring) 

# LENGTH OF THE SEQ FOR OFFSPRING
length_seq <- apply(pg_cervus[5:ncol(pg_cervus)], 1, function(x) sum(x!=0)/2)
index_temp <- match(ped_tested$individual_id, pg_cervus$individual_id)
ped_tested$length_seq <- length_seq[index_temp]  # offspring sequence
index_temp <- match(ped_tested$mother, pg_cervus$individual_id)
ped_tested$length_seq_mother <- length_seq[index_temp]  # mother sequence

7.1.4.1 Nombre d’erreur et nombre de loci comparé

On calcule ici le nombre de cas où un individu a un mismatch avec sa mère. On précise également le nombre de loci comparable entre les deux (en excluant les NA).

## Nombre de marqueurs avec mismatch
##    0    1    2    3    4 <NA> 
##  589   67   10    1    1    0

## Nombre de loci comparé
##    2    4    5    6    7    8    9   10   11   12   13   14   15   16 <NA> 
##    2    1    3   19    6   15   30   56  103  117  128   84   62   42    0

L’immense majorité des mismatchs est situé seulement sur un marqueur. 5 individus ont 2 mismatchs avec leur mère et un individu a 4 mismatchs.

7.1.4.2 Nombre de mère unique avec mismatch

Nombre de mère unique avec au moins un mismatch avec leur offspring : 20.

Leur ID est :

##  [1] 1134 1182 1292 1298 1315 1408 1432 1433 1443 1475 1520 1560 1564 1569 1586
## [16] 1635 1657 1941 1948 2116

et les erreurs sont réparties comme suit :

table(ped_tested$mother[!ped_tested$match_mother])

## 
## 1134 1182 1292 1298 1315 1408 1432 1433 1443 1475 1520 1560 1564 1569 1586 1635 
##    2    2    1    3    7    3    2    1    1    1    2    6    1    1    4   12 
## 1657 1941 1948 2116 
##   10    8    7    5

La qualité de leur génotypage :

ped_tested[!ped_tested$match_mother, c("individual_id", "length_seq",  "mother", "length_seq_mother", "n_loci_compared"),] |> 
  arrange(mother)

##    individual_id length_seq mother length_seq_mother n_loci_compared
## 1           1758         16   1134                16              16
## 2           1757         16   1134                16              16
## 3           1701         13   1182                16              13
## 4           1698         10   1182                16              10
## 5           2009         14   1292                16              14
## 6           1941         12   1298                15              11
## 7           1934         16   1298                15              15
## 8           1862         11   1298                15              10
## 9           2133         11   1315                15              10
## 10          2072         15   1315                15              14
## 11          2071         15   1315                15              14
## 12          2003         16   1315                15              15
## 13          1836         14   1315                15              13
## 14          1770         16   1315                15              15
## 15          1769         12   1315                15              11
## 16          1833         11   1408                14              10
## 17          1792         15   1408                14              13
## 18          1731         15   1408                14              13
## 19          1834         10   1432                 9               6
## 20          1835         14   1432                 9               8
## 21          1738         11   1433                15              10
## 22          1940         13   1443                15              12
## 23          1933         13   1475                14              11
## 24          2116         13   1520                16              13
## 25          1956         12   1520                16              12
## 26          1918         13   1560                15              12
## 27          1917         15   1560                15              14
## 28          1916         14   1560                15              13
## 29          1856         11   1560                15              10
## 30          1855         11   1560                15              11
## 31          1854         12   1560                15              11
## 32          1948         13   1564                11               9
## 33          2075         12   1569                10               6
## 34          2352         13   1586                14              11
## 35          2350         14   1586                14              12
## 36          2349         13   1586                14              11
## 37          2348         12   1586                14              10
## 38          1827         14   1635                 6               4
## 39          1928         15   1635                 6               6
## 40          1895         15   1635                 6               6
## 41          1931         13   1635                 6               6
## 42          1930         13   1635                 6               6
## 43          1929         15   1635                 6               6
## 44          1894         16   1635                 6               6
## 45          1893         14   1635                 6               5
## 46          1886         13   1635                 6               6
## 47          1826         13   1635                 6               6
## 48          1825         13   1635                 6               6
## 49          1824         13   1635                 6               6
## 50          2421         13   1657                13              12
## 51          2420         13   1657                13              12
## 52          2419         14   1657                13              12
## 53          2363         10   1657                13               9
## 54          2358         13   1657                13              12
## 55          2121         16   1657                13              13
## 56          2064         16   1657                13              13
## 57          2062         16   1657                13              13
## 58          1952         16   1657                13              13
## 59          1947         16   1657                13              13
## 60          2411         14   1941                12              11
## 61          2410         14   1941                12              11
## 62          2402         10   1941                12               9
## 63          2401         12   1941                12              10
## 64          2364         13   1941                12              11
## 65          2334         14   1941                12              11
## 66          2271          9   1941                12               8
## 67          2270         14   1941                12              11
## 68          2416         13   1948                13              11
## 69          2343         13   1948                13              12
## 70          2311         14   1948                13              12
## 71          2310         13   1948                13              11
## 72          2244         10   1948                13               8
## 73          2243         12   1948                13              11
## 74          2242         13   1948                13              12
## 75          2424         14   2116                13              13
## 76          2418         10   2116                13              10
## 77          2417         14   2116                13              13
## 78          2339         10   2116                13               9
## 79          2335         14   2116                13              13

Une partie des mismatchs est due à des individus (offspring ou parents) mal génotypés. La mère 1635 est clairement à re-génotyper (seulement 6 marqueurs non-NA)

7.1.4.3 Répartition des individus ayant au moins un mismatch avec leur mère dans les analyses

## 
##   2015   2017   2020   2022 2023_1 2023_2 2023_3   2026   <NA> 
##      2      6     19      8      5      3      1     35      0

7.1.4.4 Répartition mismatchs dans les années d’analyses

##                      ANALYSIS OF THE MOTHER
## ANALYSIS OF OFFSPRING 2013 2015 2020 2022 2023_1 2023_2 2023_3 <NA>
##                2015      0    0    0    0      0      0      0    2
##                2017      2    0    0    0      0      0      0    4
##                2020      3    8    0    0      0      0      1    7
##                2022      2    2    0    0      1      0      0    3
##                2023_1    2    2    0    0      0      0      0    1
##                2023_2    1    1    0    0      1      0      0    0
##                2023_3    0    1    0    0      0      0      0    0
##                2026      1    8    8    7      0      5      4    2
##                <NA>      0    0    0    0      0      0      0    0

7.1.4.5 Répartition des erreurs parmi les marqueurs

##         Analysis of offspring
##          2015 2017 2020 2022 2023_1 2023_2 2023_3 2026 <NA>
##   BIBL1     0    0    1    0      0      0      0    0    0
##   BIBL18    0    0    0    0      1      0      0    2    0
##   BIBL20    1    0    0    2      0      0      0    0    0
##   BIBL31    1    3    6    4      4      2      0   13    0
##   BIBL4     0    2    0    0      0      0      0    2    0
##   MA002     0    0    1    1      0      0      0    2    0
##   MA066     0    0    3    1      0      1      0   13    0
##   MA091     0    0    0    0      0      0      0    3    0
##   MS41      0    1    8    1      0      0      1    7    0
##   MS47      0    0    0    1      0      0      0    1    0
##   MS53      1    1    0    0      0      0      0    2    0
##   MS56      0    0    0    0      0      0      0    1    0
##   <NA>      0    0    0    0      0      0      0    0    0

##         Analysis of mother
##          2013 2015 2020 2022 2023_1 2023_2 2023_3 <NA>
##   BIBL1     1    0    0    0      0      0      0    0
##   BIBL18    0    0    0    0      0      0      2    1
##   BIBL20    0    0    0    0      1      0      0    2
##   BIBL31    7   10    0    6      0      0      0   10
##   BIBL4     0    0    0    0      0      0      2    2
##   MA002     1    0    0    1      0      0      0    2
##   MA066     2    0    8    0      1      5      1    1
##   MA091     0    0    0    0      0      0      3    0
##   MS41      0   12    0    4      0      0      0    2
##   MS47      0    0    0    0      0      0      0    2
##   MS53      0    0    0    0      0      0      1    3
##   MS56      0    0    0    0      0      0      0    1
##   <NA>      0    0    0    0      0      0      0    0

On construit le tableau d’erreur en complétant les informations (e.g. haplotype du père) et en ajoutant le nombre d’offspring avec qui la mère a des mismatchs afin de déterminer si l’erreur vient du génotypage de la mère ou de l’offspring.

# TOTAL NUMBER OF ERROR BETWEEN MOTHER AND OFFSPRINGS
# on match le nombre d'erreur total avec la mère de l'offspring
index_temp <- match(df_error_mother$individual_id, ped_tested$individual_id)
df_error_mother$n_error_total <- ped_tested$n_error[index_temp]
df_error_mother$cohort <- ped_tested$cohort[index_temp]

# AJOUT DE L'HAPLOTYPE DU PERE
#' ... Certaines erreurs viennent d'un décalage avec la mère. On regarde l'haplotype
#' ... du père pour voir de qui vient le décalage (de moi ou de la bdd)...
index_temp <- match(df_error_mother$individual_id, pedigree_data$individual_id)
df_error_mother$father <- pedigree_data[index_temp, "father"]

# fonction pour ajouter l'haplotype du père au DF
#' ... l'idée est de détecter de qui vient le décalage à l'allèle donné.
get_father_haplo <- function(i){
  father_temp <- df_error_mother[i, c("father")]
  marker_temp <- df_error_mother[i, c("marker")]
  df_temp <- df_error_mother[i, c("marker", "off_haplo", "mother_haplo")]

  haplo_father <- unlist(pg_cervus[pg_cervus$individual_id == father_temp, grepl(paste0("\\b", marker_temp, "\\b"), gsub("a|b", "", colnames(pg_cervus)))])
  haplo_father <- paste0(haplo_father[1], "/", haplo_father[2])

  df_temp$haplo_father <- haplo_father

  if(length(haplo_father) > 0){
    return(df_temp)
  } else{
    return("Father non-génotypé")
  }
}

df_error_mother$father_haplo <- NA
for(i in 1:nrow(df_error_mother)){
  df_error_mother[i, "father_haplo"] <- get_father_haplo(i)[, "haplo_father"]
}

# AJOUT DU NOMBRE DE MIMSTACH TOTAL AVEC LES DESCENDANTS DE TOUTE LA PORTEE
#' A UNE ANNEE DONEEE
ref_litter_error <- 
  ped_tested |> 
  group_by(mother, cohort) |> 
  summarise(n_correct = sum(match_mother), n_incorrect = sum(!match_mother)) |> 
  as.data.frame()

# add to df_error_mother
index <- match(paste0(df_error_mother$mother_id, df_error_mother$cohort), paste0(ref_litter_error$mother, ref_litter_error$cohort))
df_error_mother <- cbind(df_error_mother, ref_litter_error[index, c("n_correct", "n_incorrect")])


# NOMBRE DE MISMATCH ENTRE LA MERE ET L'ENSEMBLE DE SES DESCENDANTS
ref_litter_error <- 
  ped_tested |> 
  group_by(mother) |> 
  summarise(n_correct_tot = sum(match_mother), n_incorrect_tot = sum(!match_mother)) |> 
  as.data.frame()

# add to df_error_mother
index <- match(df_error_mother$mother_id, ref_litter_error$mother)
df_error_mother <- cbind(df_error_mother, ref_litter_error[index, c("n_correct_tot", "n_incorrect_tot")])


# ADD TERRITORY OF BIRTH
index <- match(df_error_mother$individual_id, pheno$individual_id)
df_error_mother$territoire <- pheno$territoire[index]
df_error_mother$annee_emergence <- pheno$annee_emergence[index]

# LENGTH OF COMPARED SEQUENCE
index <- match(df_error_mother$individual_id, ped_tested$individual_id)
df_error_mother <- cbind(df_error_mother, ped_tested[index, c("n_loci_compared", "length_seq", "length_seq_mother")])

# ADD REPRODUCTIVE STATUS
index_temp <- match(paste0(df_error_mother$mother_id, df_error_mother$annee_emergence), paste0(statut_individu$individu_id, statut_individu$annee_capture))
df_error_mother$statut_reproducteur <- statut_individu$statut_reproducteur[index_temp]

# FILTER LES INDIVIDUS DEPUIS 2013 TO 2023
# df_error_mother <- df_error_mother[!is.na(df_error_mother$year_analysis), ]

# WE REMOVE THE CASE ALREADY CORRECTED IN BDD 
#' ... Donc les cas où la parentée sociale est différente de la parentée génétique
index <- match(df_error_mother$individual_id, parentee_genetique$individual_id)
df_error_mother$match_gen <- ifelse(df_error_mother$mother_id == parentee_genetique$mother[index], T, F)

df_error_mother <- 
  df_error_mother |> 
  filter(match_gen | is.na(match_gen))

# PRINT RESULTS
df_error_mother <- 
  df_error_mother |> 
  arrange(mother_id, marker)

# # SAVE
# write.xlsx(
#   df_error_mother,
#   file = paste0(
#     "df_error_mother_",
#     format(Sys.time(), format = "%d_%m_%H_%M%"),
#     ".xlsx"
#   ),
#   row.names = FALSE
# )

7.1.4.6 Correction des erreurs

Pour identifier si le problème vient de l’offspring ou d’une autre raison, je croise plusieurs informations : si la femelle dominante est allaitante, comment le ou les mismatchs sont répartis intra et inter-portée (via la fonction check_mother_siblings_mismatch()), le nombre d’autres femelles dominantes, le cas échéant je teste les génotypes avec ces autres femelles en cas d’erreur etc…Toutes les “enquêtes” sont disponibles dans le fichier .xlsx df_error_mother.

# CHECK LES ERREURS ENTRE PORTEE ET MERES (à faire tourner à la main)
check_mother_siblings_mismatch <- function(mother_id, father_id, list_markers) {
  
  
  # Identifier les enfants du couple
  siblings <- pedigree_data |>
    filter(mother == mother_id & father == father_id) |>
    pull(individual_id)
  
  # Construire les noms des colonnes d’allèles
  alleles_names <- paste0(rep(list_markers, each = 2), c("a", "b"))
  
  # Extraire les génotypes
  df_geno <- pg_cervus[
    pg_cervus$individual_id %in% siblings,
    c("individual_id", alleles_names)
  ]
  
  # Construire une colonne "marker" pour chaque marqueur
  for(i in seq_along(list_markers)){
    idx <- (i - 1) * 2 + 1        # 1,3,5,...
    a1  <- alleles_names[idx]
    a2  <- alleles_names[idx + 1]
    
    df_geno[[list_markers[i]]] <- paste0(df_geno[[a1]], "/", df_geno[[a2]])
  }
  
  return(df_geno[c("individual_id", list_markers)])
}

# GET SIBLINGS GENOTYPE
# check_mother_siblings_mismatch(
#   mother_id = 1315,
#   father_id = 1487,
#   list_markers = c("BIBL31")
# )

Erreur issu du génotypage de l’offspring

# FUNCTION TO CORRECT GENOTYPE EASILY
correction_function <- function(id, marker, new_haplotype){
  # change pg_cervus
  pg_cervus[pg_cervus$individual_id %in% id, grepl(paste0("\\b", marker, "\\b"), x =  gsub("a|b", "", colnames(pg_cervus)))] <<- new_haplotype
}

##############################################################################
#                                                                            #
#                                  ERREURS                                   #
#                                                                            #
##############################################################################
# Erreur offspring // mère 204
correction_function(id = c("811"), marker = "BIBL20", new_haplotype = "0")

# La mère 533 est inconnue (déjà corrigée dans pédigrée génétique)
#' ... On les enlève du jeu de données
id_to_remove <- ped_tested[ped_tested$mother == 533, "individual_id"]
pg_cervus <- pg_cervus[!pg_cervus$individual_id %in% id_to_remove, ]
ped_tested <- ped_tested[!ped_tested$mother == 533, ]


# Erreur offspring // mère 636
correction_function(id = c("1567"), marker = "MS53", new_haplotype = "0")

# Erreur offspring // mère 942
correction_function(id = c("1330", "1329", "1396", "1501", "1603", "1602", "1601"), marker = "BIBL31", new_haplotype = "0")

# Erreur pas claire // mère 988
correction_function(id = c("1629"), marker = "MS45", new_haplotype = "0")

# Erreur offspring // mère 1050
correction_function(id = c("1294", "1386", "1364"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1087
correction_function(id = c("1315" ,"1300" ,"1314" ,"1385" ,"1362" ,"1456" ,"1451" ,"1509", "1836"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1112 : l'offsrping 1335 est le seul de sa portée avec erreurs (n = 4), sûrement pas le bon individu génotypé... Je mets "NA" pour tout son génotype
pg_cervus <- pg_cervus[!pg_cervus$individual_id == 1335, ]
ped_tested <- ped_tested[!ped_tested$individual_id == 1335, ]


# Erreur offspring // mère 1120
correction_function(id = c("1381", "1383", "1384"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1134
correction_function(id = c("1656"), marker = "BIBL18", new_haplotype = "0")
correction_function(id = c("1656"), marker = "MA066", new_haplotype = "0")

# Erreur offspring // mère 1182 2012 à 2014
correction_function(id = c("1368", "1369", "1495", "1560"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1182 2015, bizarre, mais voir fichier.xlsx df_error_mother
correction_function(id = c("1698"), marker = "MS53", new_haplotype = "0")
correction_function(id = c("1701"), marker = "BIBL20", new_haplotype = "0")
correction_function(id = c("1701"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1199, L'allèle est 186/186 (car 1635 est aussi mère et a le même mismatch avec toute ses portées)
correction_function(id = c("1635"), marker = "MS41", new_haplotype = "186")

# Erreur offspring // mère 1231
correction_function(id = c("1517"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1292
correction_function(id = c("1641"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1297
correction_function(id = c("1536", "1533", "1532"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1298
correction_function(id = c("1862"), marker = "MA002", new_haplotype = "0")
correction_function(id = c("1934"), marker = "BIBL1", new_haplotype = "0")

# Erreur offspring // mère 1321
correction_function(id = c("1679"), marker = "BIBL20", new_haplotype = "0")

# Erreur offspring // mère 1360
correction_function(id = c("1669"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1408
correction_function(id = c("1691"), marker = "BIBL31", new_haplotype = "0")
correction_function(id = c("1731", "1792"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1941 : venant de la mère
correction_function(id = c("1941"), marker = "MA066", new_haplotype = c("231", "231"))

# Erreur offspring // mère 2116 : venant de la mère
correction_function(id = c("2116"), marker = "MA066", new_haplotype = c("231", "231"))

# Erreur offspring // mère 1292 
correction_function(id = c("2009"), marker = "MA066", new_haplotype = c("231", "231"))

# Erreur offspring // mère 1408 
correction_function(id = c("1833"), marker = "BIBL31", new_haplotype = c("0"))
correction_function(id = c("1833"), marker = "MA002", new_haplotype = c("0"))

# Erreur offspring // mère 1432
correction_function(id = c("1834"), marker = "MS47", new_haplotype = c("0"))
correction_function(id = c("1834"), marker = "MS53", new_haplotype = c("0"))
correction_function(id = c("1834"), marker = "MS56", new_haplotype = c("0"))

# Erreur offspring // mère 1948 : venant de la mère
correction_function(id = c("1948"), marker = "BIBL31", new_haplotype = c("0"))

# Erreur offspring // mère 1443 
correction_function(id = c("1940"), marker = "MA066", new_haplotype = c("0"))
correction_function(id = c("1738"), marker = c("MS41"), new_haplotype = "0")

# Erreur offspring // mère 1475 
correction_function(id = c("1933"), marker = "MA066", new_haplotype = c("231", "231"))

# Erreur offspring // mère 1432
correction_function(id = c("1835"), marker = c("BIBL20"), new_haplotype = "0")
correction_function(id = c("1835"), marker = c("MS47"), new_haplotype = "0")

# Erreur offspring // mère 1437
correction_function(id = c("1580", "1655"), marker = "BIBL31", new_haplotype = "0")

# Erreur offspring // mère 1520
correction_function(id = c("1956"), marker = "BIBL20", new_haplotype = "0")

# Erreur offspring // mère 1564
correction_function(id = c("1948"), marker = "MS41", new_haplotype = "0")
correction_function(id = c("1948"), marker = "MA002", new_haplotype = "0")

# Erreur 2075 : probablement pas le bon individu. Je le supprime complètement.
pg_cervus <- pg_cervus[!pg_cervus$individual_id == 2075, ]
ped_tested <- ped_tested[!ped_tested$individual_id == 2075, ]


# Erreur offspring // mère 1657 : cohorte 2019/2020
correction_function(id = c("1947", "1952", "2062", "2064"), marker = "BIBL31", new_haplotype = "0")


#############################################################################
# PAS CLAIR, DANS LE DOUTE JE METS NAs A TOUT LE MONDE, A VOIR AVEC REB
#' ... Car 1540 comme mère est suggérée, mais je dois savoir dans quel sens ça va car seule une partie de la portée est coloriée dnas le livret famille
# Erreur offspring // mère 1657
correction_function(
  id = c(
    "2121",
    "2125",
    "1657",
    "2421",
    "2420",
    "2419",
    "2363",
    "2358",
    "2121",
    "2064",
    "2062",
    "1952",
    "1947"
  ),
  marker = c("BIBL31"),
  new_haplotype = "0"
)
correction_function(
  id = c("2125", "2360", "2420"),
  marker = "MA066",
  new_haplotype = "0"
)

Correction de l’identité des mères en cas de vrai erreur

# La mère 1586 en 2024 n'est pas la mère, c'est 2118
ped_tested[ped_tested$mother == 1586 & ped_tested$cohort == 2024 & !is.na(ped_tested$mother) & !is.na(ped_tested$cohort), "mother"] <- "2118"

pedigree_data[pedigree_data$mother == 1586 & pedigree_data$cohort == 2024 & !is.na(pedigree_data$mother) & !is.na(pedigree_data$cohort), "mother"] <- "2118"

# En 2016, 1443 est la vraie mère de la portée, et non 1134.
ped_tested[ped_tested$mother == 1134 & ped_tested$cohort == 2016 & !is.na(ped_tested$mother) & !is.na(ped_tested$cohort), "mother"] <- "1443"

pedigree_data[pedigree_data$mother == 1134 & pedigree_data$cohort == 2016 & !is.na(pedigree_data$mother) & !is.na(pedigree_data$cohort), "mother"] <- "1443"

# La mère 1182 en 2015 n'est probablement pas la bonne, je la mets NAs et pour le moment la supprime du pédigrée
ped_tested[ped_tested$mother == 1182 & ped_tested$cohort == 2015 & !is.na(ped_tested$mother) & !is.na(ped_tested$cohort), "mother"] <- NA

pedigree_data[pedigree_data$mother == 1182 & pedigree_data$cohort == 2015 & !is.na(pedigree_data$mother) & !is.na(pedigree_data$cohort), "mother"] <- NA

Vérification : On s’assure qu’il n’y a plus de mismatchs offsprings / mother

res_match_temp <- apply(X = ped_tested[!is.na(ped_tested$mother), 1:2],
                        MARGIN = 1,
                        FUN = function(x) match_genotype(x)$mother_match)

Nombre de mismatch mère / offspring : 0

7.2 Father test

On teste ici les pères sociaux. On va comparer les allèles par locus restant après comparaison avec la mère. C’est à dire e.g. que si l’enfant est 144/155 à un locus donné et que la mère est 144/146, alors on regardera si le père a l’allèle 155 à ce même locus. Si on a au moins un mismatch, on traitera l’individu commep potentiel EPY.

pedigree_data[grepl("dummy", pedigree_data$father), "father"] <- NA

ped_tested <- pedigree_data[!is.na(pedigree_data$mother) &
                              !pedigree_data$father %in% id_non_genotyped &
                              pedigree_data$individual_id %in% pg_cervus$individual_id &
                              !pedigree_data$individual_id %in% parentee_genetique$individual_id, ]  # individu dont la parentée génétique n'a pas été testé

7.2.1 Père social des individus

Vérification : Nombre d’individu avec père social NA

Individus à tester avec père == NA : 11.

Leur ID est :

##  [1] 1763 1764 1765 1766 1800 1801 1810 2213 2214 2215 2453

Vérification : Nombre d’individus avec père social non-génotypé

Nombre d’individus avec pères(s) non-génotypée(s) : 0

Vérification : Nombre de père social non-génotypé

Nombre de pères(s) non-génotypée(s) : 0

Leur ID est :

## character(0)

Nombre d’individu(s) avec père non-génotypée : 0

7.2.2 Comparaison des génotypes

On enlève les individus pour lesquels le père n’a pas été génotypé

ped_tested <- ped_tested[!ped_tested$father %in% id_non_genotyped &
                           ped_tested$individual_id %in% id_pups_to_test, ]

Vérification : Nombre d’individu total testé effectivement

test <- nrow(ped_tested)

Nombre d’individu(s) répondant à tous les critères de filtre : 662

7.2.3 Test

On teste le match entre le profil de le père et des offsprings à l’aide de la fonction match_genotype() disponible dans le code CERVUS_fonctions.r. L’idée est de récupérer les potentiels EPY qui seront l’ensemble des individus ayant au moins un mismatch avec leur père social. Tous ces potentiels EPY seront analysés avec CERVUS.

Ici on doit prendre en compte le fait que on va comparer pour chaque marqueur : (1) le ou les allèles de l’offspring conditionnellement à la comparaison avec la mère avec ceux du père, mais aussi (2) tout en prenant en compte le fait que la première méthode cache une part de variation entre l’offspring et le père, car si la mère a beaucoup de NA dans ses haplotypes et pas le père, alors on perd de l’info. (3), aussi, je teste si la mère est non-génotypée, si l’offspring a 0 mismatch avec le mâle dominant. Pour résoudre ce problème on va aussi comparer pour tous les loci disponibles si au moins un des allèles de l’offspring se trouve chez le père. On réalise ensuite l’union des résultats (e.g. si donne match == TRUE et donne match == FAUX, alors on précisera FAUX).

A noter que je pourrai optimiser la fonction match_genotpye() pour faire les deux d’un coup, mais par gain de temps je ne vais pas le faire et doubler les analyses à chaque fois pour avoir ces deux formes de mismatches..

# Mismatch father/offspring with alleles leftover for comparison after mother/offspring comparison
res_match_mother_conditionnal <- apply(
  X = ped_tested[1:3],
  MARGIN = 1,
  FUN = function(x)
    match_genotype(x)$father_match
)

# Test all father/offspring loci i.e. if at least one locus allele(s) of the offspring is found in father locus 
res_match_father_alone <- apply(
  X = ped_tested[c(1, 3)],
  MARGIN = 1,
  FUN = function(x)
    match_genotype(x)$mother_match
)

# Test si la mère n'est pas génotypée
mother_non_genotyped <- ped_tested$mother %in% id_non_genotyped

# Union of both mismatch
res_match <- (res_match_mother_conditionnal & res_match_father_alone) | (mother_non_genotyped & res_match_father_alone)

Associations correctes : 527 / 662 (80%)

7.2.4 Exploration des mismatchs

A noter que l’ensemble des parentées sociales ont été re-vérifiées à partir du fichier famille quand il y avait au moins un mismatch…

# GET MISMATCH INDIVIDUAL INFORMATION
index_temp <- as.numeric(which(!res_match))
index_temp <- match(ped_tested$individual_id, individu$individual_id)  # match cohort
ped_tested[["match_father"]] <- res_match


# GET NUMBER OF MISMATCH
# On extrait les erreurs liés à la comparaison du génotype conditionnellement à la mère et par comparaison directe offspring / father. On fait ensuite l'union entre les deux.
df_error_mother_conditional <- apply(X = ped_tested[!is.na(ped_tested$father), 1:3], MARGIN = 1, FUN = function(x) match_genotype(x)$df_error)

df_error_father_only <- apply(X = ped_tested[!is.na(ped_tested$father), c(1,3)], MARGIN = 1, FUN = function(x) match_genotype(x)$df_error)

# Formatage
df_error_mother_conditional <- do.call(rbind, df_error_mother_conditional)
df_error_father_only <- do.call(rbind, df_error_father_only)
colnames(df_error_father_only)[colnames(df_error_father_only) %in% c("mother_id", "mother_haplo")] <- c("father_id", "father_haplo")
df_error_father <- rbind(df_error_mother_conditional, df_error_father_only)
df_error_father <- df_error_father[!duplicated(paste0(df_error_father$individual_id, df_error_father$marker, df_error_father$father_id)), ]  # on elève les intersections entre les deux "types" de mismatchs
rownames(df_error_father) <- 1:nrow(df_error_father)

7.2.4.1 Nombre d’erreur et nombre de loci comparé

On calcule ici le nombre de cas où un individu a un mismatch avec son père On précise également le nombre de loci comparable entre les deux (en excluant les NA).

## Nombre de marqueurs avec mismatch
##    0    1    2    3    4    5    6    7    9 <NA> 
##  531   85   25    7    3    8    1    1    1    0

## Nombre de loci comparé
##    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 <NA> 
##    1    1    2    2   19    9   19   39   49  108  109  120   82   58   44    0

Comme attendu, on observe beaucoup plus de mismatch entre les pères et les offsprings qu’entre les mères et les offsprings.

7.2.4.2 Nombre de père unique avec mismatch

Nombre de père unique avec au moins un mismatch avec leur offspring : 37.

Leur ID est :

##  [1] 1091 1099 1258 1309 1326 1332 1342 1404 1409 1417 1439 1440 1487 1500 1526
## [16] 1561 1618 1622 1729 1730 1791 1845 1850 1859 1867 1895 1934 1945 1949 2038
## [31] 2050 2079 2153 2167 2171 2205

7.2.4.3 Répartition des individus ayant au moins un mismatch avec leur père dans les analyses

## 
##   2017   2020   2022 2023_1 2023_2 2023_3   2026 
##     13     25     14     10      7      9     57

7.2.4.4 Répartition des individus potentiellement EPY dans les anciens individus avec un mismatch avec la mère

bool <- unique(df_error_father$individual_id) %in% df_error_mother$individual_id

Nombre d’individus potentiellement EPY ayant eu un mismatch avec leur mère avant correction : 36

7.2.4.5 Répartition des pères avec au moins un mismatch avec leur offspring les années d’analyses

##                      ANALYSIS OF THE FATHER
## ANALYSIS OF OFFSPRING 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##                2017      2    0    0    0    0      0      0      0   11
##                2020      0    0    0    4    0      0      0      6   15
##                2022      3    1    0    0    1      0      0      4    5
##                2023_1    0    0    4    3    0      0      0      0    3
##                2023_2    0    0    0    3    0      0      0      0    4
##                2023_3    0    0    0    4    0      0      0      1    4
##                2026      2    3    4   11    8      7      4      5   13
##                <NA>      0    0    0    0    0      0      0      0    0

7.2.4.6 Répartition des erreurs parmi les marqueurs

## 
##  BIBL1 BIBL18 BIBL20 BIBL31  BIBL4  MA002  MA018  MA066  MA091   MS41   MS45 
##      5     19      5     17     10     25      3      9     14     20     30 
##   MS47   MS53   MS56    MS6   ST10 
##     22     11     21      5     21

##         Analysis of offspring
##          2017 2020 2022 2023_1 2023_2 2023_3 2026 <NA>
##   BIBL1     1    0    1      0      0      0    3    0
##   BIBL18    3    1    1      4      0      1    9    0
##   BIBL20    1    1    3      0      0      0    0    0
##   BIBL31    3    0    4      1      0      0    9    0
##   BIBL4     1    0    0      4      0      1    4    0
##   MA002     5    5    1      4      2      1    7    0
##   MA018     0    0    0      3      0      0    0    0
##   MA066     0    5    1      2      0      1    0    0
##   MA091     0    2    3      2      1      0    6    0
##   MS41      3    1    3      2      3      1    7    0
##   MS45      1   12    2      4      0      1   10    0
##   MS47      0    2    3      3      1      1   12    0
##   MS53      1    0    4      3      0      0    3    0
##   MS56      2    3    0      4      1      3    8    0
##   MS6       1    0    0      0      0      0    4    0
##   ST10      0    3    0      5      1      0   12    0
##   <NA>      0    0    0      0      0      0    0    0

##         Analysis of mother
##          2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##   BIBL1     0    2    0    0    1      1      0      0    1
##   BIBL18    0    0    2    8    1      2      0      1    5
##   BIBL20    0    0    0    0    1      0      0      1    3
##   BIBL31    5    3    2    0    4      0      0      0    3
##   BIBL4     0    1    5    1    1      0      1      0    1
##   MA002     1    0    5    6    2      1      2      0    8
##   MA018     0    0    3    0    0      0      0      0    0
##   MA066     0    0    3    0    0      0      0      3    3
##   MA091     0    2    2    3    1      0      1      2    3
##   MS41      0    2    4    0    2      0      0      0   12
##   MS45      1    1    4    4    0      0      2      9    9
##   MS47      0    3    0    0    0      3      0      1   15
##   MS53      0    2    2    1    1      0      0      3    2
##   MS56      0    0    4    9    0      1      0      2    5
##   MS6       0    1    0    0    2      0      1      0    1
##   ST10      1    0    5    5    5      3      1      0    1
##   <NA>      0    0    0    0    0      0      0      0    0

## , , NA = BIBL1
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    1
##    2020      0    0    0    0    0      0      0      0    0
##    2022      0    1    0    0    0      0      0      0    0
##    2023_1    0    0    0    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    0    0    0      0      0      0    0
##    2026      0    1    0    0    1      1      0      0    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = BIBL18
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    3
##    2020      0    0    0    0    0      0      0      0    1
##    2022      0    0    0    0    0      0      0      0    1
##    2023_1    0    0    2    2    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    0    1    0      0      0      0    0
##    2026      0    0    0    5    1      2      0      1    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = BIBL20
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    1
##    2020      0    0    0    0    0      0      0      0    1
##    2022      0    0    0    0    1      0      0      1    1
##    2023_1    0    0    0    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    0    0    0      0      0      0    0
##    2026      0    0    0    0    0      0      0      0    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = BIBL31
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      1    0    0    0    0      0      0      0    2
##    2020      0    0    0    0    0      0      0      0    0
##    2022      3    0    0    0    0      0      0      0    1
##    2023_1    0    0    1    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    0    0    0      0      0      0    0
##    2026      1    3    1    0    4      0      0      0    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = BIBL4
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    1
##    2020      0    0    0    0    0      0      0      0    0
##    2022      0    0    0    0    0      0      0      0    0
##    2023_1    0    0    4    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    1    0    0      0      0      0    0
##    2026      0    1    0    1    1      0      1      0    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MA002
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    5
##    2020      0    0    0    2    0      0      0      0    3
##    2022      1    0    0    0    0      0      0      0    0
##    2023_1    0    0    2    2    0      0      0      0    0
##    2023_2    0    0    0    2    0      0      0      0    0
##    2023_3    0    0    1    0    0      0      0      0    0
##    2026      0    0    2    0    2      1      2      0    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MA018
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    0
##    2020      0    0    0    0    0      0      0      0    0
##    2022      0    0    0    0    0      0      0      0    0
##    2023_1    0    0    3    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    0    0    0      0      0      0    0
##    2026      0    0    0    0    0      0      0      0    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MA066
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    0
##    2020      0    0    0    0    0      0      0      2    3
##    2022      0    0    0    0    0      0      0      1    0
##    2023_1    0    0    2    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    1    0    0      0      0      0    0
##    2026      0    0    0    0    0      0      0      0    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MA091
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    0
##    2020      0    0    0    1    0      0      0      0    1
##    2022      0    1    0    0    0      0      0      1    1
##    2023_1    0    0    2    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    1
##    2023_3    0    0    0    0    0      0      0      0    0
##    2026      0    1    0    2    1      0      1      1    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MS41
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    3
##    2020      0    0    0    0    0      0      0      0    1
##    2022      0    1    0    0    0      0      0      0    2
##    2023_1    0    0    2    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    3
##    2023_3    0    0    1    0    0      0      0      0    0
##    2026      0    1    1    0    2      0      0      0    3
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MS45
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      1    0    0    0    0      0      0      0    0
##    2020      0    0    0    0    0      0      0      5    7
##    2022      0    0    0    0    0      0      0      0    2
##    2023_1    0    0    4    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    0    0    0      0      0      1    0
##    2026      0    1    0    4    0      0      2      3    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MS47
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    0
##    2020      0    0    0    0    0      0      0      0    2
##    2022      0    1    0    0    0      0      0      1    1
##    2023_1    0    0    0    0    0      0      0      0    3
##    2023_2    0    0    0    0    0      0      0      0    1
##    2023_3    0    0    0    0    0      0      0      0    1
##    2026      0    2    0    0    0      3      0      0    7
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MS53
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    1
##    2020      0    0    0    0    0      0      0      0    0
##    2022      0    1    0    0    0      0      0      2    1
##    2023_1    0    0    2    1    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    0    0    0      0      0      0    0
##    2026      0    1    0    0    1      0      0      1    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MS56
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    2
##    2020      0    0    0    0    0      0      0      1    2
##    2022      0    0    0    0    0      0      0      0    0
##    2023_1    0    0    4    0    0      0      0      0    0
##    2023_2    0    0    0    1    0      0      0      0    0
##    2023_3    0    0    0    3    0      0      0      0    0
##    2026      0    0    0    5    0      1      0      1    1
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = MS6
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    1
##    2020      0    0    0    0    0      0      0      0    0
##    2022      0    0    0    0    0      0      0      0    0
##    2023_1    0    0    0    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    0    0    0      0      0      0    0
##    2026      0    1    0    0    2      0      1      0    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = ST10
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    0
##    2020      0    0    0    2    0      0      0      0    1
##    2022      0    0    0    0    0      0      0      0    0
##    2023_1    0    0    3    2    0      0      0      0    0
##    2023_2    0    0    0    1    0      0      0      0    0
##    2023_3    0    0    0    0    0      0      0      0    0
##    2026      1    0    2    0    5      3      1      0    0
##    <NA>      0    0    0    0    0      0      0      0    0
## 
## , , NA = NA
## 
##          Father
## Offspring 2013 2015 2017 2020 2022 2023_1 2023_2 2023_3 <NA>
##    2017      0    0    0    0    0      0      0      0    0
##    2020      0    0    0    0    0      0      0      0    0
##    2022      0    0    0    0    0      0      0      0    0
##    2023_1    0    0    0    0    0      0      0      0    0
##    2023_2    0    0    0    0    0      0      0      0    0
##    2023_3    0    0    0    0    0      0      0      0    0
##    2026      0    0    0    0    0      0      0      0    0
##    <NA>      0    0    0    0    0      0      0      0    0

7.2.5 Check des mismatchs…

# TOTAL NUMBER OF ERROR BETWEEN FATHER AND OFFSPRINGS
# on match le nombre d'erreur total avec la mère de l'offspring
index_temp <- match(df_error_father$individual_id, ped_tested$individual_id)
df_error_father$n_error_total <- ped_tested$n_error[index_temp]
df_error_father$cohort <- ped_tested$cohort[index_temp]

# AJOUT DE L'HAPLOTYPE DU PERE
#' ... Certaines erreurs viennent d'un décalage avec la mère On regarde l'haplotype
#' ... de la mère pour voir de qui vient le décalage (de moi ou de la bdd)...
index_temp <- match(df_error_father$individual_id, pedigree_data$individual_id)
df_error_father$mother <- pedigree_data[index_temp, "mother"]

# fonction pour ajouter l'haplotype du père au DF
#' ... l'idée est de détecter de qui vient le décalage à l'allèle donné.
get_father_haplo <- function(i){
  father_temp <- df_error_father[i, c("father_id")]
  marker_temp <- df_error_father[i, c("marker")]
  df_temp <- df_error_father[i, c("marker", "off_haplo", "father_haplo")]

  haplo_father <- unlist(pg_cervus[pg_cervus$individual_id == father_temp, grepl(paste0("\\b", marker_temp, "\\b"), gsub("a|b", "", colnames(pg_cervus)))])
  haplo_father <- paste0(haplo_father[1], "/", haplo_father[2])

  df_temp$haplo_father <- haplo_father

  if(length(haplo_father) > 0){
    return(df_temp)
  } else{
    return("Father non-génotypée")
  }
}

df_error_father$father_haplo <- NA
for(i in 1:nrow(df_error_father)){
  df_error_father[i, "father_haplo"] <- get_father_haplo(i)[, "haplo_father"]
}

# AJOUT DU NOMBRE DE MIMSTACH TOTAL AVEC LES DESCENDANTS DE TOUTE LA PORTEE
#' A UNE ANNEE DONEEE
ref_litter_error <- 
  ped_tested |> 
  group_by(father, cohort) |> 
  summarise(n_correct = sum(match_father), n_incorrect = sum(!match_father)) |> 
  as.data.frame()

# add to df_error_father
index <- match(paste0(df_error_father$mother_id, df_error_father$cohort), paste0(ref_litter_error$mother, ref_litter_error$cohort))
df_error_father <- cbind(df_error_father, ref_litter_error[index, c("n_correct", "n_incorrect")])


# NOMBRE DE MISMATCH ENTRE LE PERE ET L'ENSEMBLE DE SES DESCENDANTS
ref_litter_error <- 
  ped_tested |> 
  group_by(father) |> 
  summarise(n_correct_tot = sum(match_father), n_incorrect_tot = sum(!match_father)) |> 
  as.data.frame()

# add to df_error_father
index <- match(df_error_father$father_id, ref_litter_error$father)
df_error_father <- cbind(df_error_father, ref_litter_error[index, c("n_correct_tot", "n_incorrect_tot")])


# ADD TERRITORY OF BIRTH
index <- match(df_error_father$individual_id, pheno$individual_id)
df_error_father$territoire <- pheno$territoire[index]
df_error_father$annee_emergence <- pheno$annee_emergence[index]

# LENGTH OF COMPARED SEQUENCE
index <- match(df_error_father$individual_id, ped_tested$individual_id)
df_error_father <- cbind(df_error_father, ped_tested[index, c("n_loci_compared", "length_seq", "length_seq_father")])

# ADD REPRODUCTIVE STATUS
index_temp <- match(paste0(df_error_father$father_id, df_error_father$annee_emergence), paste0(statut_individu$individu_id, statut_individu$annee_capture))
df_error_father$statut_reproducteur <- statut_individu$statut_reproducteur[index_temp]

# FILTER LES INDIVIDUS DEPUIS 2013 TO 2023
# df_error_father <- df_error_father[!is.na(df_error_father$year_analysis), ]

# WE REMOVE THE CASE ALREADY CORRECTED IN BDD 
#' ... Donc les cas où la parentée sociale est différente de la parentée génétique
index <- match(df_error_father$individual_id, parentee_genetique$individual_id)
df_error_father$match_gen <- ifelse(df_error_father$father_id == parentee_genetique$father[index], T, F)

df_error_father <- 
  df_error_father |> 
  filter(match_gen | is.na(match_gen))

# PRINT RESULTS
df_error_father <- 
  df_error_father |> 
  arrange(father_id, individual_id, cohort, marker)

# # SAVE
# write.xlsx(
#   df_error_father,
#   file = paste0(
#     "df_error_father_",
#     format(Sys.time(), format = "%d_%m_%H_%M%_%S_"),
#     ".xlsx"
#   ),
#   row.names = FALSE
# )

L’ensemble des mismatchs offsprings / pères est disponible dans le fichier df_error_father.xlsx

Ces mismatchs sont répartis entre 39 pères.

Leur ID est :

##  [1] 1091 1099 1258 1309 1326 1332 1342 1404 1409 1417 1439 1440 1487 1500 1504
## [16] 1526 1561 1618 1622 1729 1730 1768 1791 1845 1850 1859 1867 1895 1934 1945
## [31] 1949 1996 2038 2050 2079 2153 2167 2171 2205

8 ANALYSE CERVUS

8.1 Genotype file

On sauvegarde pg_cervus en format .txt (tabulation) pour Cervus.

# CONVERSION DES ALLELES EN NOMBRE
df_temp <- pg_cervus[, c(1:2, 5:ncol(pg_cervus))]
df_temp[3:ncol(df_temp)] <- sapply(df_temp[3:ncol(df_temp)], as.numeric)

# SAVE
write.table(df_temp, file = paste0(dir, "/CERVUS ANALYSIS/genotype.txt"), sep = "\t", row.names = FALSE)
save(pg_cervus, file = paste0(dir, "/CERVUS ANALYSIS/pg_cervus.rds"))

8.2 Offspring file

Ce fichier doit contenir l’ID de l’individu à tester, la mère (qu’on traitera comme connue) et \(x\) colonnes contenant les ID des pères candidats.

On a déjà les individus à tester dans CERVUS dans l’objet ped_tested contenant les ID individuelles et les parents sociaux des individus génotypés entre 2016 et 2025, de parents génétiques non-disponibles dans la BDD, de date d’émergence connue et de mère sociale connue et génotypée.

Nombre d’individu testé : 662

On va constuire 4 offsprings files :

Un premier pour tester la validité de la méthode avec une uniquement la femelle dominante comme candidate
Un second avec la mère connue et le mâle dominant comme unique candidat
Un troisième avec la mère connue et l’ensemble des mâles sexuellement matures à \(t\) et \(t-1\) comme pères candidats
Un quatrième avec uniquement les offsprings avec au moins un mismatch avec le mâle dominant et l’ensemble des mâles sexuellement matures à \(t\) et \(t-1\) comme pères candidats

8.2.1 Mère candidat

df_temp <- ped_tested[c("individual_id", "mother")]
df_temp$father <- NA

write.table(df_temp, file = paste0(dir, "/CERVUS ANALYSIS/MOTHER ONLY/offspring_mere.txt"), sep = "\t", row.names = FALSE)

8.2.2 Père seul candidat

df_temp <- ped_tested[c("individual_id", "mother", "father")]

write.table(df_temp, file = paste0(dir, "/CERVUS ANALYSIS/MALE DOMINANT ONLY/offspring_pere_seul.txt"), sep = "\t", row.names = FALSE)

8.2.3 Mâles sexuellement matures candidats

On va considérer comme candidat tous les mâles sexuellement matures présent entre \(t-1\) et \(t+1\), \(t\) étant la naissance du marmotton. On exclute d’office les individus mâles ayant été noté comme “non-reproducteur”. On ajoute aussi systématiquement le mâle dominant comme candidat.

# LOAD INDIVIDU STATUT SOCIAL
# ... avec les séquences reconstruites, code dispo dans le script infanticide.rmd
load("~/marmot-quantitative-genetics/1_DATA_ANALYSIS/3_Data/statut_dominance.rds")  # présence
load("~/marmot-quantitative-genetics/1_DATA_ANALYSIS/3_Data/individu_bdd.rds")  # informations individuelles
load("~/marmot-quantitative-genetics/PEDIGREE_GENETIQUE/CONSTRUCTION_PROFIL_GENETIQUE/CERVUS ANALYSIS/statut_individu.rds")  # statut reproducteur


# ADD REPRODUCTIVE STATUS TO PRESENCE DATA
statut_dominance$key <- paste0(statut_dominance$individual_id, "_", statut_dominance$annee)
statut_individu$key <- paste0(statut_individu$individu_id, "_", statut_individu$annee_capture)

statut_dominance <- 
  left_join(statut_dominance,
          statut_individu[c("key", "statut_reproducteur")] |> filter(!is.na(statut_reproducteur)),
          by = "key", multiple = "first")



# GET AGE (TO SELECT ADULTS CANDIDATES ONLY)
# ... cohort
statut_dominance <- 
  left_join(statut_dominance, pedigree_data[c("individual_id", "cohort", "mother", "father")], by = "individual_id")

statut_dominance <- 
  left_join(statut_dominance, individu_bdd[c("individu_id", "age_obs")],
          by = join_by(individual_id == individu_id))

# ... cohort or first capture
statut_dominance <- 
  statut_dominance |> 
  group_by(individual_id) |> 
  mutate(
    annee_emergence = coalesce(cohort, min(annee))
  ) |> 
  ungroup() |> 
  as.data.frame()

# ... compute age
statut_dominance$age <- 
  statut_dominance$age_obs + 
  statut_dominance$annee - 
  statut_dominance$annee_emergence

# SAVE
save(statut_dominance, file = paste0(dir, "/CERVUS ANALYSIS/statut_dominance_cervus.rds"))


# ID TO ANALYSE IN CERVUS
id_to_analyse <- ped_tested$individual_id

# CONSTRUCT OFFSPRING DATAFRAME BASED 
offspring_cervus_t <- c()  
t_levels <- c()

# we bind the offspring file with associated candidate father per year
for(t in c(2015:2025)){
  
  # OFFSPRING OF YEAR t TO BE TESTED
 offspring <- 
   statut_dominance |> 
    filter(cohort == t) |> 
    filter(!duplicated(individual_id)) |> 
    select(individual_id, mother, cohort)
  # offspring <- statut_dominance[statut_dominance$cohort == t &
  #                                    !duplicated(statut_dominance$individual_id), c("individual_id", "mother", "cohort")] 
  
  
  offspring <- 
    offspring |> 
    filter(individual_id %in% id_to_analyse)
  
  if(nrow(offspring) == 0) next
  
  # SELECT CANDIDATE FATHERS
  candidate_fathers <- 
    statut_dominance |>
    filter(
      sexe == "m" &  # male individual
        (is.na(statut_reproducteur) | (statut_reproducteur != "non_reproducteur")) &
        (age >= 2) &  # adult à t
        (annee %in% (c(t-1, t, t+1)))  # recapturé après t
    ) |> # captured / observed recently
  distinct(individual_id) |>
  pull(individual_id)
  
  cat(paste0("\nNb of candidate fathers for year ", t, " : ", length(candidate_fathers)))
  
  
  # OFFSPRING FILE BIND WITH CANDIDATE FATHERS
  if(length(offspring) > 0){
    
    offspring_cervus_t.temp <- cbind(offspring, t(candidate_fathers))
    
    # ADD DOMINANT MALE IF NOT PRESENT IN CANDIDATE
    id_offspring <- offspring_cervus_t.temp$individual_id
    index <- match(id_offspring, pedigree_data$individual_id)
    soc_father <- pedigree_data[index, "father"]
    
    col_temp <- c()
    for(i in 1:length(id_offspring)){
      if(!soc_father[i] %in% offspring_cervus_t.temp[i, 4:ncol(offspring_cervus_t.temp)]){
        col_temp <- c(col_temp, soc_father[i])
      } else{
        col_temp <- c(col_temp, NA)
      }
    }
    
    if(!all(is.na(col_temp))){
      offspring_cervus_t.temp <- cbind(offspring_cervus_t.temp, col_temp)
    }
    
    offspring_cervus_t <- bind_rows(offspring_cervus_t, offspring_cervus_t.temp)
    t_levels <- c(t_levels, offspring_cervus_t.temp$cohort)
  }
}

## 
## Nb of candidate fathers for year 2015 : 65
## Nb of candidate fathers for year 2016 : 72
## Nb of candidate fathers for year 2017 : 69
## Nb of candidate fathers for year 2018 : 59
## Nb of candidate fathers for year 2019 : 45
## Nb of candidate fathers for year 2020 : 63
## Nb of candidate fathers for year 2021 : 68
## Nb of candidate fathers for year 2022 : 80
## Nb of candidate fathers for year 2023 : 64
## Nb of candidate fathers for year 2024 : 73
## Nb of candidate fathers for year 2025 : 57

# SAVE
write.table(offspring_cervus_t, file = paste0(dir, "/CERVUS ANALYSIS/MALES AS CANDIDATES/offspring_male_candidates.txt"), sep = "\t", row.names = FALSE)

8.2.4 EPY potentiels et mâles sexuellement matures candidats

On extrait les individus ayant au moins un mismatch avec le mâle social comme potentiel EPY. Ce sont ces indivius qu’on va tester dans CERVUS.

# FILTER OFFSPRING FILES TO KEEP ONLY INDIVIDUALS WITH AT LEAST ONE MISMATCH WITH FATHER
id_potential_epy <- ped_tested$individual_id[!ped_tested$match_father]
offspring_cervus_t_epy <- offspring_cervus_t[offspring_cervus_t$individual_id %in% id_potential_epy, ]

# SAVE
write.table(offspring_cervus_t_epy, file = paste0(dir, "/CERVUS ANALYSIS/EPY/offspring_epy.txt"), sep = "\t", row.names = FALSE)

Analyse des profils génétiques 2016 à 2025

Pierre-Alexandre QUITTET

2025-10-03

1 SESSION INFORMATION

2 PACKAGE, DATA AND FUNCTIONS

3 DATA IMPORTATION

3.1 Table “parentee_genetique”

3.2 Table “analyse_microsat”

3.3 Profils génétiques lus sur Genemapper

3.4 Table “individu”

3.5 Table “statut”

4 DATA CLEANING AND FORMATTING

4.1 Formattage

4.1.1 RECODE DES HOMOZYGOTES

4.1.2 FORMATAGE DU NOM

4.1.3 ALLELES DE REFERENCE

4.1.4 SUPPRESSION MARQUEUR MA001

4.1.5 VERIFICATIONS FINALES

4.1.6 Hauteur de pics et correspondance de la taille des allèles

4.1.6.1 Longueur attendue et observée

4.1.6.1.1 Mes lectures

4.1.6.1.2 BDD

4.2 Genotype re-test

4.2.1 Merge des données BDD et des données Genemapper

4.2.2 Individus re-testés de puis le début de suivi

4.2.3 Mismatch test-retest

4.2.3.1 Etape 1 : congruence des tests-retests

4.2.3.2 Etape 2

4.2.4 Match test-retest

4.3 Non-genotyped individuals

5 FORMATAGE DES DONNEES GENOTYPES POUR CERVUS

5.1 Formatage

5.2 Save

6 IDENTIFICATION DES INDIVIDUS A TESTER AVEC CERVUS

7 RAW PARENTAGE COMPARISON

7.1 Mother test

7.1.1 Mère sociale des individus

7.1.2 Comparaison des génotypes

7.1.3 Test

7.1.4 Exploration des mismatchs

7.1.4.1 Nombre d’erreur et nombre de loci comparé

7.1.4.2 Nombre de mère unique avec mismatch

7.1.4.3 Répartition des individus ayant au moins un mismatch avec leur mère dans les analyses

7.1.4.4 Répartition mismatchs dans les années d’analyses

7.1.4.5 Répartition des erreurs parmi les marqueurs

7.1.4.6 Correction des erreurs

7.2 Father test

7.2.1 Père social des individus

7.2.2 Comparaison des génotypes

7.2.3 Test

7.2.4 Exploration des mismatchs

7.2.4.1 Nombre d’erreur et nombre de loci comparé

7.2.4.2 Nombre de père unique avec mismatch

7.2.4.3 Répartition des individus ayant au moins un mismatch avec leur père dans les analyses

7.2.4.4 Répartition des individus potentiellement EPY dans les anciens individus avec un mismatch avec la mère

7.2.4.5 Répartition des pères avec au moins un mismatch avec leur offspring les années d’analyses

7.2.4.6 Répartition des erreurs parmi les marqueurs

7.2.5 Check des mismatchs…

8 ANALYSE CERVUS

8.1 Genotype file

8.2 Offspring file

8.2.1 Mère candidat

8.2.2 Père seul candidat

8.2.3 Mâles sexuellement matures candidats

8.2.4 EPY potentiels et mâles sexuellement matures candidats