1. Evaluating association by distance to TSS and transcript type

Convert non-canonical gene names

# get all LOC gene names and update them there are not in ENSEMBL database with LOC 
symbols <- gsub("LOC","",unique(grep("LOC", geneassoc.hg19$gene,value = T)))
mapping <- select(org.Hs.eg.db, symbols, c("ENTREZID","GENENAME","ENSEMBL","SYMBOL"))

## 'select()' returned 1:many mapping between keys and columns

mapping$SYMBOL[is.na(mapping$SYMBOL)] <- "NA"
aux <- geneassoc.hg19
idx <- grep('LOC',aux$gene)
aux$gene[idx] <- mapping$SYMBOL[match(gsub('LOC','', aux$gene[idx]), mapping$ENTREZID)]
aux$gene <- as.character(HGNChelper::checkGeneSymbols(aux$gene)$Suggested.Symbol)

## Warning in HGNChelper::checkGeneSymbols(aux$gene): Some lower-case letters were found and converted to upper-case.
##                  HGNChelper is intended for human symbols only, which should be all
##                  upper-case except for open reading frames (orf).

## Warning in HGNChelper::checkGeneSymbols(aux$gene): x contains non-approved
## gene symbols

geneassoc.hg19 <- aux

Bind hg38 (GENCODEv22) and hg19 (RefGene 2010)

geneassoc.hg38 %>%
  mutate(posClass=as.character(cut(
    position, breaks = c(-1500,-200,0,200,1500, Inf), include.lowest = T,
    labels = c('[-1500,-200] upstream TSS',
               '(-200, 0] upstream TSS',
               '(0, 200] downstream TSS',
               '(200, 1500] downstream TSS', 
               '>1500 downstream TSS')))) %>%
  filter(geneType != 'TEC') %>%
  full_join(geneassoc.hg19, by=c('probe','gene')) %>%
  distinct(probe, gene, tclass, geneType, posClass) %>% 
  dplyr::select(gene, posClass, tclass, geneType) %>%
  group_by(gene, posClass, tclass) %>%
  summarise(hg38type=dplyr::first(geneType)) %>%
  dplyr::rename(hg38pos=posClass, hg19=tclass) %>%
  as.data.frame %>%
  mutate(
    hg19 = replace_na(hg19, 'Not Associated in hg19'),
    hg38type = replace_na(hg38type, 'Not Associated in hg38 Genetype'),
    hg38pos = replace_na(hg38pos, 'Not Associated in hg38 position')) %>% 
  group_by(hg19, hg38type, hg38pos) %>%
  summarise(count=n()) %>%
  filter(count > 200) %>%
  as.data.frame -> df

Sankey Diagram

nodes <- data.frame(name=c(unique(df$hg19), unique(df$hg38pos), unique(df$hg38type)))
# turn names to numbers (IDs)
name2ID <- setNames(0:(nrow(nodes)-1), nodes$name)
df %>% mutate(source = name2ID[hg38pos], target = name2ID[hg19]) -> df.left
df %>% mutate(source = name2ID[hg19], target = name2ID[hg38type]) -> df.right
df.links <- as.data.frame(
  rbind(
    df.left %>% dplyr::select(source,target,count) %>% 
      group_by(source, target) %>% summarise(count=sum(count)),
    df.right %>% dplyr::select(source,target,count) %>% 
      group_by(source, target) %>% summarise(count=sum(count))))

# a few renaming
nodes %>% mutate(
  name = replace(name, name=='Not Associated in hg38 Genetype', 'Not Associated'),
  name = replace(name, name=='Not Associated in hg38 position', 'Not Associated'),
  name = replace(name, name=='Not Associated in hg19', 'Not Associated')) -> nodes

sankeyNetwork(
  Links = df.links, Nodes=nodes, 
  Source='source', Target='target', Value='count', 
  NodeID='name', fontSize=13, nodeWidth=30, 
  height=1100, width=1000, fontFamily = 'Arial')

2. Compare genes captured in hg19 and genes captured in hg38

geneassoc.hg38 %>%
  mutate(posClass=as.character(cut(
    position, breaks = c(-1500,-200,0,200,1500, Inf), include.lowest = T,
    labels = c(
      '[-1500,-200] upstream TSS', '(-200, 0] upstream TSS',
      '(0, 200] downstream TSS', '(200, 1500] downstream TSS', '>1500 downstream TSS')))) %>%
  filter(geneType != 'TEC') %>%
  group_by(gene, posClass) %>%
  summarise(hg38type=dplyr::first(geneType), hg38.count=n()) %>%
  as.data.frame -> df.hg38

geneassoc.hg19 %>%
  group_by(gene, tclass) %>%
  summarise(hg19.count=n()) %>% 
  as.data.frame %>% full_join(df.hg38, by='gene') -> df

df$tclass[is.na(df$tclass)] <- 'Not Associated'
df$tclass <- factor(
  df$tclass, 
  levels = c('TSS1500', 'TSS200',"5'UTR",'1stExon','Body',"3'UTR", 'Not Associated'))
df$posClass <- factor(
  df$posClass, 
  levels = c(
    '[-1500,-200] upstream TSS', '(-200, 0] upstream TSS',
    '(0, 200] downstream TSS', '(200, 1500] downstream TSS', '>1500 downstream TSS'))

# focus only on big categories
df %>% filter(hg38type %in% c(
  'protein_coding','antisense', 'miRNA','lincRNA','processed_pseudogene','snoRNA')) -> df
df$hg38type <- factor(df$hg38type, levels = c(
  'protein_coding','antisense', 'lincRNA','processed_pseudogene','miRNA','snoRNA'))
df %>% 
  group_by(tclass, posClass, hg38type) %>% 
  summarise(frac=n()/length(unique(df$gene))) -> df1

ggplot(df1) + 
  geom_bar(aes(tclass, frac, fill=posClass), position='dodge', stat='identity') + 
  ylab('Number of Genes Captured in hg38') + xlab('Genes Captured in hg19') + 
  scale_fill_discrete(guide=guide_legend(title="Distance to TSS (hg38)")) + 
  facet_wrap(~hg38type) + theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))

GDAN - DNA methylation (legacy vs GDC release 6.0) - Probe-Gene Association Comparison

1. Evaluating association by distance to TSS and transcript type

Convert non-canonical gene names

Bind hg38 (GENCODEv22) and hg19 (RefGene 2010)

Sankey Diagram

2. Compare genes captured in hg19 and genes captured in hg38