Generate sql database from Maxquant output

Maxquant produces quite convoluted output in tab delimited files. To get a first idea obout those files I would like to read them into a sql database and explore them using sqlitestudio.

To populate the database I am going to use R and the package dplyr.

Collect meta information about the files

suppressMessages(library(dplyr))

path <- "output/20150930/combined/txt/"
files <- dir(path)
filesf <- dir(path, full.names = T)
files <- data_frame(files,filesf)
info <- file.info(files$filesf)
stopifnot(files$filesf == rownames(info))
files <- tbl_df(data.frame(files, info))
glimpse(files)
## Observations: 20
## Variables: 9
## $ files  (chr) "aifMsms.txt", "allPeptides.txt", "Arg10_StandardSites....
## $ filesf (chr) "output/20150930/combined/txt/aifMsms.txt", "output/201...
## $ size   (dbl) 0, 107412695, 129699, 3179471, 0, 186883, 0, 251804, 0,...
## $ isdir  (lgl) FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,...
## $ mode   (octm) 666, 666, 666, 666, 666, 666, 666, 666, 666, 666, 666,...
## $ mtime  (time) 2015-09-30 16:52:38, 2015-09-30 16:56:51, 2015-09-30 1...
## $ ctime  (time) 2015-09-30 11:51:04, 2015-09-30 11:52:29, 2015-09-30 1...
## $ atime  (time) 2015-09-30 11:51:04, 2015-09-30 16:54:06, 2015-09-30 1...
## $ exe    (chr) "no", "no", "no", "no", "no", "no", "no", "no", "no", "...

Select text files with size greater than 0.

files <- filter(files, size > 0)
files <- files[ grep("\\.txt",files$files),]

Read all the files into R

mqdata <-list()
for(file in files$filesf){
  mqdata[[file]] <- read.csv(file,sep="\t",stringsAsFactors = FALSE)
}
stopifnot(names(mqdata) == files$filesf)

Dump them into an sqlite database

Before doing it you need to fix the database files which we derive from the file names. As a collegue of mine says if you see a file name such as “Oxidation (M)Sites.txt” you know where they came from. It is a lonely place.

names <- files$files
names <- gsub("\\.txt", "",files$files)
names <- gsub("[[:space:]]", "", names) 
names <- gsub("[[:punct:]]", "", names)
names(mqdata) <- names

library(DBI)
if(file.exists(file.path(path,"my_mqdb.sqlite3"))){
  file.remove(file.path(path,"my_mqdb.sqlite3"))
}
  
my_mqdb <- src_sqlite(file.path(path,"my_mqdb.sqlite3"), create=TRUE)

for( i in 1:length(mqdata)){
  tmp<-copy_to(my_mqdb, mqdata[[i]], name=names(mqdata)[i], temporary = FALSE)
}
dbDisconnect(my_mqdb$con)

Now you are ready to explore the data with SQLiteStudio.

For more interesting information about Data Science in Mass Spectrometry visit http://aggrivet.blogspot.ch/.