I look at the called genotypes for each sample and evaluate the extent to which called genotypes are consistent with the strain the sample is claimed to have come from.
library(data.table)
library(stringr)
pre.cross <- data.table(read.csv('pre_crossfile_fixed.csv', stringsAsFactors = FALSE, na.strings = c('?', 'Uncallable', 'Bad')))
all(pre.cross$match == TRUE)
## [1] TRUE
claimed.BL <- pre.cross[str_count(string = strain, pattern = 'B') > 0, ]
claimed.FP <- pre.cross[str_count(string = strain, pattern = 'F') > 0, ]
num.non.snp.cols <- 6
genotyped.snp.names <- sapply(X = strsplit(x = names(claimed.BL)[-(1:num.non.snp.cols)], split = '.', fixed = TRUE), FUN = '[', 2)
setnames(x = claimed.BL,
old = names(claimed.BL)[-(1:num.non.snp.cols)],
new = genotyped.snp.names)
setnames(x = claimed.FP,
old = names(claimed.FP)[-(1:num.non.snp.cols)],
new = genotyped.snp.names)
print(claimed.BL[,1:8, with = FALSE])
## mouse.id strain sex dist.trav dna.assay match 059444286 132510525
## 1: 11116 BLBL F 6352.72 11116 TRUE A:G C:G
## 2: 11117 BLBL F 7705.94 11117 TRUE A:A C:C
## 3: 11118 BLBL F 6528.31 11118 TRUE A:G C:G
## 4: 11119 BLBL F 4730.00 11119 TRUE A:G G:G
## 5: 11120 BLBL F 6456.38 11120 TRUE A:G G:G
## ---
## 545: 17378 LBBL M 4724.14 17378 TRUE A:G C:G
## 546: 17379 LBBL M 1796.70 17379 TRUE A:G G:G
## 547: 17380 LBBL M 4916.99 17380 TRUE A:G C:C
## 548: 17381 LBBL M 7770.99 17381 TRUE A:G G:G
## 549: 17382 LBBL M 3537.35 17382 TRUE A:A C:C
print(claimed.FP[,1:8, with = FALSE])
## mouse.id strain sex dist.trav dna.assay match 059444286 132510525
## 1: 11151 FPFP F 8519.77 11151 TRUE A:G G:G
## 2: 11152 FPFP F 11594.82 11152 TRUE G:G C:G
## 3: 11153 FPFP F 7268.83 11153 TRUE A:G G:G
## 4: 11154 FPFP F 6060.79 11154 TRUE G:G C:C
## 5: 11155 FPFP F 5023.55 11155 TRUE A:G C:G
## ---
## 527: 17269 PFFP F 7853.73 17269 TRUE A:G C:G
## 528: 17270 PFFP F 10134.61 17270 TRUE A:G C:G
## 529: 17271 PFFP M 10985.48 17271 TRUE A:G C:G
## 530: 17272 PFFP M 7764.47 17272 TRUE A:G C:G
## 531: 17273 PFFP M 5566.57 17273 TRUE A:G C:G
snp.info <- data.table(read.csv('SNPs_Lisa Tarantino project_RWC.csv', stringsAsFactors = FALSE))
snp.info <- snp.info[, snp.id := sapply(X = strsplit(x = snp.id, split = '-'), FUN = '[', 2)]
setnames(x = snp.info, 'CHR', 'chr')
print(snp.info)
## snp.id chr pos rsid B L P F
## 1: 009072542 1 9.01 rs3714728 A C C A
## 2: 032969697 1 32.57 rs3707642 G T G T
## 3: 043973192 1 43.51 rs4136282 C A A C
## 4: 059444286 1 58.90 rs4222382 G A A G
## 5: 080181487 1 79.30 rs3677697 T A T A
## ---
## 179: 063529554 X 77.78 rs3663982 C A A C
## 180: 067878326 X 82.13 rs3675552 A C C A
## 181: 088983890 X 103.33 rs3694287 T C T C
## 182: 115587169 X 132.75 rs4151956 T C C T
## 183: 123551831 X 140.75 rs3723498 A G A G
# for each mouse
for (m.id in claimed.BL$mouse.id) {
# reset counters
this.mouse <- claimed.BL[mouse.id == m.id,]
consistent.w.BL <- 0
consistent.w.FP <- 0
# for each snp
for (snp.name in names(claimed.BL)[-(1:num.non.snp.cols)]) {
B.like <- L.like <- F.like <- P.like <- 0
if (!is.na(snp.info[snp.id == snp.name, B])) {
B.like <- str_count(string = this.mouse[[snp.name]],
pattern = snp.info[snp.id == snp.name, B])
}
if (!is.na(snp.info[snp.id == snp.name, L]) &
snp.info[snp.id == snp.name, B] != snp.info[snp.id == snp.name, L]) {
L.like <- str_count(string = this.mouse[[snp.name]],
pattern = snp.info[snp.id == snp.name, L])
}
if (!is.na(snp.info[snp.id == snp.name, F])) {
F.like <- str_count(string = this.mouse[[snp.name]],
pattern = snp.info[snp.id == snp.name, F])
}
if (!is.na(snp.info[snp.id == snp.name, P]) &
snp.info[snp.id == snp.name, F] != snp.info[snp.id == snp.name, P]) {
P.like <- str_count(string = this.mouse[[snp.name]],
pattern = snp.info[snp.id == snp.name, P])
}
consistent.w.BL <- consistent.w.BL + B.like + L.like
consistent.w.FP <- consistent.w.FP + F.like + P.like
}
if (consistent.w.FP >= consistent.w.BL) {
print(paste(m.id, 'BL:', consistent.w.BL, 'FP:', consistent.w.FP))
}
}
for (m.id in claimed.FP$mouse.id) {
this.mouse <- claimed.FP[mouse.id == m.id,]
consistent.w.BL <- 0
consistent.w.FP <- 0
for (snp.name in names(claimed.BL)[-(1:num.non.snp.cols)]) {
B.like <- L.like <- F.like <- P.like <- 0
if (!is.na(snp.info[snp.id == snp.name, B])) {
B.like <- str_count(string = this.mouse[[snp.name]],
pattern = snp.info[snp.id == snp.name, B])
}
if (!is.na(snp.info[snp.id == snp.name, L]) &
snp.info[snp.id == snp.name, B] != snp.info[snp.id == snp.name, L]) {
L.like <- str_count(string = this.mouse[[snp.name]],
pattern = snp.info[snp.id == snp.name, L])
}
if (!is.na(snp.info[snp.id == snp.name, F])) {
F.like <- str_count(string = this.mouse[[snp.name]],
pattern = snp.info[snp.id == snp.name, F])
}
if (!is.na(snp.info[snp.id == snp.name, P]) &
snp.info[snp.id == snp.name, F] != snp.info[snp.id == snp.name, P]) {
P.like <- str_count(string = this.mouse[[snp.name]],
pattern = snp.info[snp.id == snp.name, P])
}
consistent.w.BL <- consistent.w.BL + B.like + L.like
consistent.w.FP <- consistent.w.FP + F.like + P.like
}
if (consistent.w.BL >= consistent.w.FP) {
print(paste(m.id, 'BL:', consistent.w.BL, 'FP:', consistent.w.FP))
}
}