This package contains only one function correctCounts as to assign multiply-mapped-reads to the most abundant gene that it mapped to.
library(correctMultiCount)
data(baseCount)
data(multiCount)
head(baseCount)
## id count
## 1 ENSG00000001626 1
## 2 ENSG00000002586 1
## 3 ENSG00000002587 1
## 4 ENSG00000002746 1
## 5 ENSG00000002822 1
## 6 ENSG00000003147 1
head(multiCount)
## fragment_id gene_id
## 1 NS500358:89:HJWK2BGXX:1:11101:4921:20051 ENSG00000131584
## 2 NS500358:89:HJWK2BGXX:1:11101:22227:18777 ENSG00000248333
## 3 NS500358:89:HJWK2BGXX:1:11101:22227:18777 ENSG00000008128
## 4 NS500358:89:HJWK2BGXX:1:11101:22227:18777 ENSG00000268575
## 5 NS500358:89:HJWK2BGXX:1:11101:4437:19164 ENSG00000157933
## 6 NS500358:89:HJWK2BGXX:1:11101:24100:3433 ENSG00000171735
Noted that this function needed to two data frames with the exact column names as shown here.
df <- correctCounts(baseCount,multiCount)
head(df)
## id count
## 1 cluster45 1
## 2 cluster35 1
## 3 cluster19 1
## 4 cluster18 1
## 5 ENSG00000278655 1
## 6 ENSG00000278259 1
Now, lets check if it works as we thought.
library(dplyr)
compareDF <- df %>%
setNames(c('id','newCount')) %>%
inner_join(baseCount) %>% # merge the old and new count dataframe
tbl_df
And find out the fragments that mapped to at least two locus
multiCount %>%
group_by(fragment_id) %>%
summarize(mapped_location_count = n()) %>%
filter(mapped_location_count > 1) %>%
arrange(-mapped_location_count) %>%
tbl_df
## Source: local data frame [50 x 2]
##
## fragment_id mapped_location_count
## (chr) (int)
## 1 NS500358:89:HJWK2BGXX:1:11102:24483:2032 4
## 2 NS500358:89:HJWK2BGXX:1:11101:15507:16834 3
## 3 NS500358:89:HJWK2BGXX:1:11101:22227:18777 3
## 4 NS500358:89:HJWK2BGXX:1:11101:22716:6217 3
## 5 NS500358:89:HJWK2BGXX:1:11101:24133:6674 3
## 6 NS500358:89:HJWK2BGXX:1:11101:7121:16523 3
## 7 NS500358:89:HJWK2BGXX:1:11101:11170:5964 2
## 8 NS500358:89:HJWK2BGXX:1:11101:11988:19460 2
## 9 NS500358:89:HJWK2BGXX:1:11101:12283:11997 2
## 10 NS500358:89:HJWK2BGXX:1:11101:12915:10280 2
## .. ... ...
Lets check the third fragment
fragment = multiCount %>%
filter(fragment_id=='NS500358:89:HJWK2BGXX:1:11101:22227:18777') %>%
tbl_df
head(fragment)
## Source: local data frame [3 x 2]
##
## fragment_id gene_id
## (chr) (chr)
## 1 NS500358:89:HJWK2BGXX:1:11101:22227:18777 ENSG00000248333
## 2 NS500358:89:HJWK2BGXX:1:11101:22227:18777 ENSG00000008128
## 3 NS500358:89:HJWK2BGXX:1:11101:22227:18777 ENSG00000268575
fragment_mapped_gene <- fragment$gene_id
compareDF %>%
filter(id %in% fragment_mapped_gene)
## Source: local data frame [1 x 3]
##
## id newCount count
## (chr) (int) (int)
## 1 ENSG00000248333 2 1
So here you see, the multiply-mapped gene is assign to the only gene locus that originally has count.