Problem definition

In mass spectrometry you frequently need to match points in RT and MZ with some measurement error.

Generate sample data

tmp1 <- cbind(sample(letters,1000,replace=TRUE),sample(letters,1000,replace=TRUE),sample(letters,1000,replace=TRUE))
tmp2 <- cbind(sample(letters,1000,replace=TRUE),sample(letters,1000,replace=TRUE),sample(letters,1000,replace=TRUE))

tmp1<-apply(tmp1, 1, paste ,collapse="")
tmp2<-apply(tmp2, 1, paste ,collapse="")

nrseq1 <- 1:1000
nrseq2 <- (rnorm(1000, mean=1000,sd=100))

tab1 <- data.frame(tmp1, nrseq1 + rnorm(1000,0,1.5), nrseq2 + rnorm(1000,0,0.4))
tab2 <-  data.frame(tmp2,  nrseq1 + rnorm(1000,0,1.5), nrseq2 + rnorm(1000,0,0.4) )

colnames(tab1) <- c("name","rt","mz")

tab2 <-  data.frame(tmp2,  nrseq1 + rnorm(1000,0,1.5), nrseq2 + rnorm(1000,0,0.4) )
colnames(tab2) <- c("name","rt","mz")

Add 4 columns with error range for mz and rt to one of the tables

mzerror <- 1
rterror <- 2.5
tab2<-data.frame(tab2, rtmi = tab2$rt - rterror , rtma = tab2$rt + rterror , mzmi = tab2$mz - mzerror, mzma = tab2$mz + mzerror )

Using DBI and dplyr

Dump both dataframes into a sqlite database using dplyr

suppressMessages(library(dplyr))
suppressMessages(library(DBI))

dbfile <- "my_mqdb.sqlite3"
if(file.exists(dbfile)){
  file.remove(dbfile)
}
## [1] TRUE
my_mqdb <- src_sqlite(dbfile, create=TRUE)
copy_to(my_mqdb, tab1,  temporary = FALSE)
## Source: sqlite 3.8.6 [my_mqdb.sqlite3]
## From: tab1 [1,000 x 3]
## 
##     name       rt        mz
##    (chr)    (dbl)     (dbl)
## 1    gfc 2.460138 1047.6260
## 2    rnx 3.214221  878.4476
## 3    gaj 4.190572 1019.8799
## 4    tfu 5.032405  999.7834
## 5    nms 5.064113  904.9872
## 6    rtv 5.417524  829.2131
## 7    hmj 8.062583 1055.5779
## 8    kxb 5.765629 1000.4393
## 9    mvv 7.816920  937.1018
## 10   loc 8.177781 1070.6675
## ..   ...      ...       ...
copy_to(my_mqdb, tab2,  temporary = FALSE)
## Source: sqlite 3.8.6 [my_mqdb.sqlite3]
## From: tab2 [1,000 x 7]
## 
##     name        rt        mz       rtmi      rtma      mzmi      mzma
##    (chr)     (dbl)     (dbl)      (dbl)     (dbl)     (dbl)     (dbl)
## 1    ssc  2.786813 1047.8346  0.2868130  5.286813 1046.8346 1048.8346
## 2    mgl  1.606113  878.4946 -0.8938866  4.106113  877.4946  879.4946
## 3    qjz  4.836084 1020.3323  2.3360841  7.336084 1019.3323 1021.3323
## 4    kit  3.795568  999.9929  1.2955676  6.295568  998.9929 1000.9929
## 5    ihk  5.152855  903.6745  2.6528545  7.652855  902.6745  904.6745
## 6    gao  5.978470  828.9304  3.4784703  8.478470  827.9304  829.9304
## 7    dnb  8.959492 1055.9336  6.4594919 11.459492 1054.9336 1056.9336
## 8    idg  9.276355 1001.0328  6.7763554 11.776355 1000.0328 1002.0328
## 9    hxy  7.677646  936.6617  5.1776462 10.177646  935.6617  937.6617
## 10   qzr 10.077482 1071.2388  7.5774821 12.577482 1070.2388 1072.2388
## ..   ...       ...       ...        ...       ...       ...       ...
dbDisconnect(my_mqdb$con)
## [1] TRUE

Join both tables on double columns to find matching features

my_mqdb <- src_sqlite(dbfile)
dbListTables(my_mqdb$con)
## [1] "sqlite_stat1" "tab1"         "tab2"
res <-dbSendQuery(my_mqdb$con , "select tab1.mz, tab1.rt, tab2.mz as tab2mz, tab2.rt as tab2rt from tab1 
                  inner join tab2 on tab1.mz < tab2.mzma and tab1.mz > tab2.mzmi 
                  and tab1.rt < tab2.rtma and tab1.rt > tab2.rtmi")
resData <- dbFetch(res,n=-1)
dbClearResult(res)
## [1] TRUE
dim(resData)
## [1] 722   4
head(resData)
##          mz       rt    tab2mz   tab2rt
## 1 1047.6260 2.460138 1047.8346 2.786813
## 2  878.4476 3.214221  878.4946 1.606113
## 3 1019.8799 4.190572 1020.3323 4.836084
## 4  999.7834 5.032405  999.9929 3.795568
## 5  829.2131 5.417524  828.9304 5.978470
## 6 1055.5779 8.062583 1055.9336 8.959492
plot(resData$rt, resData$mz, xlab="RT", ylab="mz",pch="x" )
points(resData$tab2rt, resData$tab2mz,col=2 )

plot((resData$rt + resData$tab2rt)/2, resData$mz - resData$tab2mz, xlab="rt")

plot(resData$rt - resData$tab2rt, (resData$mz + resData$tab2mz)/2, ylab="mz")

dbDisconnect(my_mqdb$con)
## [1] TRUE

AFAIK you can’t do it as easily with any of the R join methods as base::merge or dplyr::innter_join. But maybee sqldf does it?

Joining using sqldf package

suppressMessages(library(sqldf))
res2 <- sqldf("select tab1.mz, tab1.rt, tab2.mz as tab2mz, tab2.rt as tab2rt from tab1 
                  inner join tab2 on tab1.mz < tab2.mzma and tab1.mz > tab2.mzmi 
                  and tab1.rt < tab2.rtma and tab1.rt > tab2.rtmi")
## Loading required package: tcltk
dim(res2)
## [1] 722   4
head(res2)
##          mz       rt    tab2mz   tab2rt
## 1 1047.6260 2.460138 1047.8346 2.786813
## 2  878.4476 3.214221  878.4946 1.606113
## 3 1019.8799 4.190572 1020.3323 4.836084
## 4  999.7834 5.032405  999.9929 3.795568
## 5  829.2131 5.417524  828.9304 5.978470
## 6 1055.5779 8.062583 1055.9336 8.959492
plot(res2$rt, res2$mz, xlab="RT", ylab="mz",pch="x" )
points(res2$tab2rt, res2$tab2mz,col=2 )