GRanges column names from the data
library(GenomicRanges)
Create a small example dataset with minimum necessary variables:
# Minimal example data: five rows whose start/end columns are named with the
# field name first ("Start_position", "End_position").
exampleData <- data.frame(
  seqnames = paste0("Chr", seq_len(5L)),
  Start_position = seq_len(5L),
  End_position = seq(4L, 8L),
  Strand = rep("+", 5L),
  stringsAsFactors = FALSE
)
names(exampleData)## [1] "seqnames" "Start_position" "End_position" "Strand"
Finding GRanges columns with this dataset does not work because only prefixes are sought.
err <- try(GenomicRanges:::.find_GRanges_cols(names(exampleData)))
attr(err, "condition")## <simpleError in .find_start_end_cols(df_colnames0, start.field0, end.field0): cannnot determine start/end columns>
The error originates in the .find_start_end_cols helper function.
GenomicRanges:::.find_start_end_cols## function (df_colnames, start.field, end.field)
## {
## idx1 <- which(df_colnames %in% start.field)
## idx2 <- which(df_colnames %in% end.field)
## if (length(idx1) == 1L && length(idx2) == 1L)
## return(list(c(start = idx1, end = idx2), ""))
## if (length(idx1) == 0L && length(idx2) == 0L) {
## prefixes1 <- .collect_prefixes(df_colnames, start.field)
## prefixes2 <- .collect_prefixes(df_colnames, end.field)
## if (length(prefixes1) == 1L && length(prefixes2) == 1L &&
## prefixes1 == prefixes2) {
## prefix <- prefixes1
## idx1 <- which(df_colnames %in% paste0(prefix, start.field))
## idx2 <- which(df_colnames %in% paste0(prefix, end.field))
## if (length(idx1) == 1L && length(idx2) == 1L)
## return(list(c(start = idx1, end = idx2), prefix))
## }
## }
## stop("cannnot determine start/end columns")
## }
## <environment: namespace:GenomicRanges>
The GenomicRanges:::.collect_prefixes function only looks for prefixes.
We add a .collect_suffixes function (a proposed counterpart to GenomicRanges:::.collect_prefixes) that collects the suffixes of column names starting with the start.field or end.field (e.g., the "_position" in Start_position and End_position).
# Collect the distinct suffixes of the column names that start with any of
# the given field names.
#
# df_colnames: character vector of column names to search.
# field:       character vector of candidate field names (e.g. "start").
#
# Returns the unique suffixes (the part of each matching column name that
# follows the matched field name), in order of first appearance. A column
# name that equals a field name exactly contributes the empty string "".
# Returns NULL when `field` is empty or nothing matches.
.collect_suffixes <- function(df_colnames, field) {
  suffixes <- lapply(field, function(fld) {
    matched <- df_colnames[which(startsWith(df_colnames, fld))]
    # Strip the field name actually matched in this iteration. Using
    # nchar(field) here would recycle the lengths of *all* field names
    # across the matches and cut the names at the wrong position.
    substr(matched, nchar(fld) + 1L, nchar(matched))
  })
  unique(unlist(suffixes))
}
.collect_suffixes(tolower(names(exampleData)), "start")## [1] "_position"
The revised function (.find_start_end_cols2) returns the appropriate indices when a column name starts with the start.field or end.field.
names(exampleData)## [1] "seqnames" "Start_position" "End_position" "Strand"
.find_start_end_cols2(tolower(names(exampleData)), "start", "end")## [[1]]
## start end
## 2 3
##
## [[2]]
## [1] ""
For multiple matches, we may want to set up a rule like that of match(), where only the first match is returned.
With the current function we get an error:
# Extend the example with a second chromosome-like column ("chr") and with
# start/end columns in both prefix form (TxStart/TxEnd) and suffix form
# (StartTx/EndTx), so that several columns are candidates at once.
exDat <- cbind(
  exampleData,
  data.frame(
    chr = paste0("chr", seq_len(5L)),
    TxStart = seq_len(5L),
    TxEnd = seq(4L, 8L),
    StartTx = seq_len(5L),
    EndTx = seq_len(5L)
  )
)
names(exDat)## [1] "seqnames" "Start_position" "End_position" "Strand"
## [5] "chr" "TxStart" "TxEnd" "StartTx"
## [9] "EndTx"
multiMatchError <- try(GenomicRanges:::.find_GRanges_cols(names(exDat)))
attr(multiMatchError, "condition")## <simpleError in .find_seqnames_col(df_colnames0, seqnames.field0, prefix): cannnot determine seqnames column unambiguously>
The revised function gives the user a warning if there are multiple matches and prefers prefixes to suffixes.
Columns with prefixes are preferred here (i.e., ‘TxStart’ and ‘TxEnd’).
names(exDat)## [1] "seqnames" "Start_position" "End_position" "Strand"
## [5] "chr" "TxStart" "TxEnd" "StartTx"
## [9] "EndTx"
.find_start_end_cols2(tolower(names(exDat)), "start", "end")## [[1]]
## start end
## 6 7
##
## [[2]]
## [1] "tx"
# Variant without the prefix-form columns: besides "chr", only the
# suffix-form StartTx/EndTx columns are added to the example data.
exDat2 <- cbind(
  exampleData,
  data.frame(
    chr = paste0("chr", seq_len(5L)),
    StartTx = seq_len(5L),
    EndTx = seq_len(5L)
  )
)
names(exDat2)## [1] "seqnames" "Start_position" "End_position" "Strand"
## [5] "chr" "StartTx" "EndTx"
.find_start_end_cols2(tolower(names(exDat2)), "start", "end")## Warning in .find_start_end_cols2(tolower(names(exDat2)), "start", "end"):
## multiple suffixes found, using first match
## [[1]]
## start end
## 2 3
##
## [[2]]
## [1] ""
This rule of taking the first match (or index) was implemented for GenomicRanges:::.find_width_col and could also be implemented for GenomicRanges:::.find_seqnames_col.