0.1 Finding GRanges column names from the data

library(GenomicRanges)

Create a small example dataset with minimum necessary variables:

# Five rows, one per chromosome label; every range is 4 positions wide and
# sits on the "+" strand. Strings are kept as character, not factor.
exampleData <- data.frame(
    seqnames = sprintf("Chr%d", 1:5),
    Start_position = seq(1L, 5L),
    End_position = seq(4L, 8L),
    Strand = rep("+", 5L),
    stringsAsFactors = FALSE
)
names(exampleData)
## [1] "seqnames"       "Start_position" "End_position"   "Strand"

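Finding GRanges columns with this dataset does not work because only prefixes are sought: the lookup recognizes names that end with the start/end field (e.g., TxStart, TxEnd) but not names that begin with it (e.g., Start_position, End_position).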

# Wrap the call in try() so the failure does not halt the script; the
# original error is then read back from the "condition" attribute.
err <- try(GenomicRanges:::.find_GRanges_cols(names(exampleData)))
attr(err, "condition")
## <simpleError in .find_start_end_cols(df_colnames0, start.field0, end.field0): cannnot determine start/end columns>

The error originates in the .find_start_end_cols helper function.

GenomicRanges:::.find_start_end_cols
## function (df_colnames, start.field, end.field) 
## {
##     idx1 <- which(df_colnames %in% start.field)
##     idx2 <- which(df_colnames %in% end.field)
##     if (length(idx1) == 1L && length(idx2) == 1L) 
##         return(list(c(start = idx1, end = idx2), ""))
##     if (length(idx1) == 0L && length(idx2) == 0L) {
##         prefixes1 <- .collect_prefixes(df_colnames, start.field)
##         prefixes2 <- .collect_prefixes(df_colnames, end.field)
##         if (length(prefixes1) == 1L && length(prefixes2) == 1L && 
##             prefixes1 == prefixes2) {
##             prefix <- prefixes1
##             idx1 <- which(df_colnames %in% paste0(prefix, start.field))
##             idx2 <- which(df_colnames %in% paste0(prefix, end.field))
##             if (length(idx1) == 1L && length(idx2) == 1L) 
##                 return(list(c(start = idx1, end = idx2), prefix))
##         }
##     }
##     stop("cannnot determine start/end columns")
## }
## <environment: namespace:GenomicRanges>

The GenomicRanges:::.collect_prefixes function only looks for prefixes.

0.1.0.1 Suggestion

Add a .collect_suffixes function to GenomicRanges that collects the suffixes of column names starting with the start.field or end.field (e.g., the "_position" in Start_position and End_position).

# Collect the distinct suffixes that follow any of the given field names.
#
# For each element of 'field', the entries of 'df_colnames' that start with
# that element are located and the leading field name is stripped off; what
# remains is the suffix (possibly "" when a column name equals the field).
#
# Arguments:
#   df_colnames  character vector of column names to search.
#   field        character vector of candidate field names (e.g. "start").
#
# Returns the unique suffixes, in order of first appearance; the result has
# length 0 when no column name starts with any element of 'field'.
#
# Matching is case-sensitive: callers are expected to normalise case first.
.collect_suffixes <- function(df_colnames, field) {
    suffixes <- lapply(field, function(pre) {
        matched <- df_colnames[which(startsWith(df_colnames, pre))]
        # Strip exactly the current field name. Using nchar(field) here
        # would pass a vector of offsets that substr() recycles across
        # 'matched', cutting at the wrong position when 'field' has
        # several elements.
        substr(matched, nchar(pre) + 1L, nchar(matched))
    })
    unique(unlist(suffixes))
}

.collect_suffixes(tolower(names(exampleData)), "start")
## [1] "_position"

The revised function (.find_start_end_cols2) returns the appropriate indices when a column name starts with the start.field or end.field.

names(exampleData)
## [1] "seqnames"       "Start_position" "End_position"   "Strand"
.find_start_end_cols2(tolower(names(exampleData)), "start", "end")
## [[1]]
## start   end 
##     2     3 
## 
## [[2]]
## [1] ""

For multiple matches, we may want to set up a rule like the one used by match(), where only the first match is returned.

With the current function we get an error (in this example it is raised by the .find_seqnames_col helper):

# Extend the example with an additional 'chr' column and two extra
# start/end pairs: one where "Tx" is a prefix (TxStart/TxEnd) and one where
# it is a suffix (StartTx/EndTx).
exDat <- cbind(exampleData,
               data.frame(chr = paste0("chr", 1:5),
                          TxStart = 1:5, TxEnd = 4:8, StartTx = 1:5, EndTx = 1:5))
names(exDat)
## [1] "seqnames"       "Start_position" "End_position"   "Strand"        
## [5] "chr"            "TxStart"        "TxEnd"          "StartTx"       
## [9] "EndTx"
multiMatchError <- try(GenomicRanges:::.find_GRanges_cols(names(exDat)))
attr(multiMatchError, "condition")
## <simpleError in .find_seqnames_col(df_colnames0, seqnames.field0, prefix): cannnot determine seqnames column unambiguously>

The revised function prefers prefixes to suffixes and warns the user when multiple suffixes are found, in which case the first match is used.

Columns with prefixes are preferred here (i.e., ‘TxStart’ and ‘TxEnd’):

names(exDat)
## [1] "seqnames"       "Start_position" "End_position"   "Strand"        
## [5] "chr"            "TxStart"        "TxEnd"          "StartTx"       
## [9] "EndTx"
.find_start_end_cols2(tolower(names(exDat)), "start", "end")
## [[1]]
## start   end 
##     6     7 
## 
## [[2]]
## [1] "tx"
# Extend the example with a 'chr' column and only the suffixed pair
# (StartTx/EndTx); no prefixed TxStart/TxEnd columns are added.
exDat2 <- cbind(exampleData,
               data.frame(chr = paste0("chr", 1:5),
                          StartTx = 1:5, EndTx = 1:5))
names(exDat2)
## [1] "seqnames"       "Start_position" "End_position"   "Strand"        
## [5] "chr"            "StartTx"        "EndTx"
.find_start_end_cols2(tolower(names(exDat2)), "start", "end")
## Warning in .find_start_end_cols2(tolower(names(exDat2)), "start", "end"):
## multiple suffixes found, using first match
## [[1]]
## start   end 
##     2     3 
## 
## [[2]]
## [1] ""

This rule of taking the first match (or index) was implemented for GenomicRanges:::.find_width_col and could also be implemented for GenomicRanges:::.find_seqnames_col.