1. Evaluating association by distance to TSS and transcript type
Convert non-canonical gene names
# get all LOC gene names and update them there are not in ENSEMBL database with LOC
symbols <- gsub("LOC","",unique(grep("LOC", geneassoc.hg19$gene,value = T)))
mapping <- select(org.Hs.eg.db, symbols, c("ENTREZID","GENENAME","ENSEMBL","SYMBOL"))
## 'select()' returned 1:many mapping between keys and columns
mapping$SYMBOL[is.na(mapping$SYMBOL)] <- "NA"
aux <- geneassoc.hg19
idx <- grep('LOC',aux$gene)
aux$gene[idx] <- mapping$SYMBOL[match(gsub('LOC','', aux$gene[idx]), mapping$ENTREZID)]
aux$gene <- as.character(HGNChelper::checkGeneSymbols(aux$gene)$Suggested.Symbol)
## Warning in HGNChelper::checkGeneSymbols(aux$gene): Some lower-case letters were found and converted to upper-case.
## HGNChelper is intended for human symbols only, which should be all
## upper-case except for open reading frames (orf).
## Warning in HGNChelper::checkGeneSymbols(aux$gene): x contains non-approved
## gene symbols
geneassoc.hg19 <- aux
Bind hg38 (GENCODEv22) and hg19 (RefGene 2010)
geneassoc.hg38 %>%
mutate(posClass=as.character(cut(
position, breaks = c(-1500,-200,0,200,1500, Inf), include.lowest = T,
labels = c('[-1500,-200] upstream TSS',
'(-200, 0] upstream TSS',
'(0, 200] downstream TSS',
'(200, 1500] downstream TSS',
'>1500 downstream TSS')))) %>%
filter(geneType != 'TEC') %>%
full_join(geneassoc.hg19, by=c('probe','gene')) %>%
distinct(probe, gene, tclass, geneType, posClass) %>%
dplyr::select(gene, posClass, tclass, geneType) %>%
group_by(gene, posClass, tclass) %>%
summarise(hg38type=dplyr::first(geneType)) %>%
dplyr::rename(hg38pos=posClass, hg19=tclass) %>%
as.data.frame %>%
mutate(
hg19 = replace_na(hg19, 'Not Associated in hg19'),
hg38type = replace_na(hg38type, 'Not Associated in hg38 Genetype'),
hg38pos = replace_na(hg38pos, 'Not Associated in hg38 position')) %>%
group_by(hg19, hg38type, hg38pos) %>%
summarise(count=n()) %>%
filter(count > 200) %>%
as.data.frame -> df
Sankey Diagram
nodes <- data.frame(name=c(unique(df$hg19), unique(df$hg38pos), unique(df$hg38type)))
# turn names to numbers (IDs)
name2ID <- setNames(0:(nrow(nodes)-1), nodes$name)
df %>% mutate(source = name2ID[hg38pos], target = name2ID[hg19]) -> df.left
df %>% mutate(source = name2ID[hg19], target = name2ID[hg38type]) -> df.right
df.links <- as.data.frame(
rbind(
df.left %>% dplyr::select(source,target,count) %>%
group_by(source, target) %>% summarise(count=sum(count)),
df.right %>% dplyr::select(source,target,count) %>%
group_by(source, target) %>% summarise(count=sum(count))))
# a few renaming
nodes %>% mutate(
name = replace(name, name=='Not Associated in hg38 Genetype', 'Not Associated'),
name = replace(name, name=='Not Associated in hg38 position', 'Not Associated'),
name = replace(name, name=='Not Associated in hg19', 'Not Associated')) -> nodes
sankeyNetwork(
Links = df.links, Nodes=nodes,
Source='source', Target='target', Value='count',
NodeID='name', fontSize=13, nodeWidth=30,
height=1100, width=1000, fontFamily = 'Arial')
2. Compare genes captured in hg19 and genes captured in hg38
geneassoc.hg38 %>%
mutate(posClass=as.character(cut(
position, breaks = c(-1500,-200,0,200,1500, Inf), include.lowest = T,
labels = c(
'[-1500,-200] upstream TSS', '(-200, 0] upstream TSS',
'(0, 200] downstream TSS', '(200, 1500] downstream TSS', '>1500 downstream TSS')))) %>%
filter(geneType != 'TEC') %>%
group_by(gene, posClass) %>%
summarise(hg38type=dplyr::first(geneType), hg38.count=n()) %>%
as.data.frame -> df.hg38
geneassoc.hg19 %>%
group_by(gene, tclass) %>%
summarise(hg19.count=n()) %>%
as.data.frame %>% full_join(df.hg38, by='gene') -> df
df$tclass[is.na(df$tclass)] <- 'Not Associated'
df$tclass <- factor(
df$tclass,
levels = c('TSS1500', 'TSS200',"5'UTR",'1stExon','Body',"3'UTR", 'Not Associated'))
df$posClass <- factor(
df$posClass,
levels = c(
'[-1500,-200] upstream TSS', '(-200, 0] upstream TSS',
'(0, 200] downstream TSS', '(200, 1500] downstream TSS', '>1500 downstream TSS'))
# focus only on big categories
df %>% filter(hg38type %in% c(
'protein_coding','antisense', 'miRNA','lincRNA','processed_pseudogene','snoRNA')) -> df
df$hg38type <- factor(df$hg38type, levels = c(
'protein_coding','antisense', 'lincRNA','processed_pseudogene','miRNA','snoRNA'))
df %>%
group_by(tclass, posClass, hg38type) %>%
summarise(frac=n()/length(unique(df$gene))) -> df1
ggplot(df1) +
geom_bar(aes(tclass, frac, fill=posClass), position='dodge', stat='identity') +
ylab('Number of Genes Captured in hg38') + xlab('Genes Captured in hg19') +
scale_fill_discrete(guide=guide_legend(title="Distance to TSS (hg38)")) +
facet_wrap(~hg38type) + theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))
