I look at the called genotypes for each sample and evaluate the extent to which called genotypes are consistent with the strain the sample is claimed to have come from.

Read in data and standardize snp names

library(data.table)
library(stringr)

pre.cross <- data.table(read.csv('pre_crossfile_fixed.csv', stringsAsFactors = FALSE, na.strings = c('?', 'Uncallable', 'Bad')))
all(pre.cross$match == TRUE)

## [1] TRUE

claimed.BL <- pre.cross[str_count(string = strain, pattern = 'B') > 0, ]
claimed.FP <- pre.cross[str_count(string = strain, pattern = 'F') > 0, ]

num.non.snp.cols <- 6
genotyped.snp.names <- sapply(X = strsplit(x = names(claimed.BL)[-(1:num.non.snp.cols)], split = '.', fixed = TRUE), FUN = '[', 2)

setnames(x = claimed.BL, 
         old = names(claimed.BL)[-(1:num.non.snp.cols)],
         new = genotyped.snp.names)

setnames(x = claimed.FP, 
         old = names(claimed.FP)[-(1:num.non.snp.cols)],
         new = genotyped.snp.names)

print(claimed.BL[,1:8, with = FALSE])

##      mouse.id strain sex dist.trav dna.assay match 059444286 132510525
##   1:    11116   BLBL   F   6352.72     11116  TRUE       A:G       C:G
##   2:    11117   BLBL   F   7705.94     11117  TRUE       A:A       C:C
##   3:    11118   BLBL   F   6528.31     11118  TRUE       A:G       C:G
##   4:    11119   BLBL   F   4730.00     11119  TRUE       A:G       G:G
##   5:    11120   BLBL   F   6456.38     11120  TRUE       A:G       G:G
##  ---                                                                  
## 545:    17378   LBBL   M   4724.14     17378  TRUE       A:G       C:G
## 546:    17379   LBBL   M   1796.70     17379  TRUE       A:G       G:G
## 547:    17380   LBBL   M   4916.99     17380  TRUE       A:G       C:C
## 548:    17381   LBBL   M   7770.99     17381  TRUE       A:G       G:G
## 549:    17382   LBBL   M   3537.35     17382  TRUE       A:A       C:C

print(claimed.FP[,1:8, with = FALSE])

##      mouse.id strain sex dist.trav dna.assay match 059444286 132510525
##   1:    11151   FPFP   F   8519.77     11151  TRUE       A:G       G:G
##   2:    11152   FPFP   F  11594.82     11152  TRUE       G:G       C:G
##   3:    11153   FPFP   F   7268.83     11153  TRUE       A:G       G:G
##   4:    11154   FPFP   F   6060.79     11154  TRUE       G:G       C:C
##   5:    11155   FPFP   F   5023.55     11155  TRUE       A:G       C:G
##  ---                                                                  
## 527:    17269   PFFP   F   7853.73     17269  TRUE       A:G       C:G
## 528:    17270   PFFP   F  10134.61     17270  TRUE       A:G       C:G
## 529:    17271   PFFP   M  10985.48     17271  TRUE       A:G       C:G
## 530:    17272   PFFP   M   7764.47     17272  TRUE       A:G       C:G
## 531:    17273   PFFP   M   5566.57     17273  TRUE       A:G       C:G

snp.info <- data.table(read.csv('SNPs_Lisa Tarantino project_RWC.csv', stringsAsFactors = FALSE))
snp.info <- snp.info[, snp.id := sapply(X = strsplit(x = snp.id, split = '-'), FUN = '[', 2)]
setnames(x = snp.info, 'CHR', 'chr')

print(snp.info)

##         snp.id chr    pos      rsid B L P F
##   1: 009072542   1   9.01 rs3714728 A C C A
##   2: 032969697   1  32.57 rs3707642 G T G T
##   3: 043973192   1  43.51 rs4136282 C A A C
##   4: 059444286   1  58.90 rs4222382 G A A G
##   5: 080181487   1  79.30 rs3677697 T A T A
##  ---                                       
## 179: 063529554   X  77.78 rs3663982 C A A C
## 180: 067878326   X  82.13 rs3675552 A C C A
## 181: 088983890   X 103.33 rs3694287 T C T C
## 182: 115587169   X 132.75 rs4151956 T C C T
## 183: 123551831   X 140.75 rs3723498 A G A G

Check for claimed BL mice that would be more consistent with FP

# for each mouse
for (m.id in claimed.BL$mouse.id) {
  
  # reset counters
  this.mouse <- claimed.BL[mouse.id == m.id,]
  consistent.w.BL <- 0
  consistent.w.FP <- 0
  
  # for each snp
  for (snp.name in names(claimed.BL)[-(1:num.non.snp.cols)]) {
    
    B.like <- L.like <- F.like <- P.like <- 0
    
    if (!is.na(snp.info[snp.id == snp.name, B])) { 
      B.like <- str_count(string = this.mouse[[snp.name]], 
                          pattern = snp.info[snp.id == snp.name, B])
    }
    if (!is.na(snp.info[snp.id == snp.name, L]) &
          snp.info[snp.id == snp.name, B] != snp.info[snp.id == snp.name, L]) {
      L.like <- str_count(string = this.mouse[[snp.name]],
                          pattern = snp.info[snp.id == snp.name, L])
    }
    if (!is.na(snp.info[snp.id == snp.name, F])) {
      F.like <- str_count(string = this.mouse[[snp.name]],
                          pattern = snp.info[snp.id == snp.name, F])
    }
    if (!is.na(snp.info[snp.id == snp.name, P]) &
          snp.info[snp.id == snp.name, F] != snp.info[snp.id == snp.name, P]) {
      P.like <- str_count(string = this.mouse[[snp.name]],
                          pattern = snp.info[snp.id == snp.name, P])
    }
    
    consistent.w.BL <- consistent.w.BL + B.like + L.like
    consistent.w.FP <- consistent.w.FP + F.like + P.like
  }
  
  if (consistent.w.FP >= consistent.w.BL) {
    print(paste(m.id, 'BL:', consistent.w.BL, 'FP:', consistent.w.FP))
  }
}

Check for claimed FP mice that would be more consistent with BL

for (m.id in claimed.FP$mouse.id) {
  
  this.mouse <- claimed.FP[mouse.id == m.id,]
  consistent.w.BL <- 0
  consistent.w.FP <- 0
  
  for (snp.name in names(claimed.BL)[-(1:num.non.snp.cols)]) {
    
    B.like <- L.like <- F.like <- P.like <- 0
    
    if (!is.na(snp.info[snp.id == snp.name, B])) {
      B.like <- str_count(string = this.mouse[[snp.name]], 
                          pattern = snp.info[snp.id == snp.name, B])
    }
    if (!is.na(snp.info[snp.id == snp.name, L]) & 
          snp.info[snp.id == snp.name, B] != snp.info[snp.id == snp.name, L]) {
      L.like <- str_count(string = this.mouse[[snp.name]], 
                          pattern = snp.info[snp.id == snp.name, L])
    }
    if (!is.na(snp.info[snp.id == snp.name, F])) {
      F.like <- str_count(string = this.mouse[[snp.name]], 
                          pattern = snp.info[snp.id == snp.name, F])
    }
    if (!is.na(snp.info[snp.id == snp.name, P]) & 
          snp.info[snp.id == snp.name, F] != snp.info[snp.id == snp.name, P]) {
      P.like <- str_count(string = this.mouse[[snp.name]], 
                          pattern = snp.info[snp.id == snp.name, P])
    }
    
    consistent.w.BL <- consistent.w.BL + B.like + L.like
    consistent.w.FP <- consistent.w.FP + F.like + P.like
  }
  
  if (consistent.w.BL >= consistent.w.FP) {
    print(paste(m.id, 'BL:', consistent.w.BL, 'FP:', consistent.w.FP))
  }
}

Check for Lab Mixups

Robert Corty

January 19, 2015

Read in data and standardize snp names

Check for claimed BL mice that would be more consistent with FP

Check for claimed FP mice that would be more consistent with BL