Contents

1 Gene symbols in MSigDB

1.1 Load MSigDB and Excel-modified gene symbols

Excel-mogrified gene symbols (mog_map.csv) was catalogued by importing all gene symbols into Excel and exporting them in all available date formats. Also, 38,040 gene symbols used in MSigDB database is downloaded and saved in msigdb.v7.0.symbols.gmt.

setwd("/data2/HGNChelper/inst/extdata")

library(HGNChelper)

readGMT <- function (fname, name.column = 1) {
    if (!(name.column == 1 | name.column == 2))
        stop("name.column should be 1 or 2")
    x <- readLines(fname)
    x <- strsplit(x, split = "\t")
    output <- lapply(x, function(y) y[-1:-2])
    names(output) <- make.names(sapply(x, function(y) y[name.column]), unique = TRUE)
    return(output)
}

# gmt.files <- dir(pattern = "\\.gmt$")
# if(length(gmt.files) == 0) stop(paste("No .gmt files found in", getwd()))

# Load all .gmt files
gmt.files <- "msigdb.v7.0.symbols.gmt"
all.sigs <- lapply(gmt.files, readGMT)
all.sigs <- lapply(all.sigs, function(x) unique(unlist(x)))
# all.sigs <- lapply(all.sigs, function(x) x[!grepl("^LOC", x)]) 
names(all.sigs) <- gmt.files
all.sigs$OVERALL <- unique(do.call(c, all.sigs))

# Load Excel-mogrified gene symbols
mog <- read.csv(system.file("extdata", "mog_map.csv", package="HGNChelper"), 
                as.is=TRUE)

1.2 Update using HGNChelper

Fetch the up-to-date human gene symbol map using HGNChelper::getCurrentHumanMap.

CurrentHumanMap = getCurrentHumanMap()
## Fetching gene symbols from ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt

Update gene symbols using HGNChelper and check any Excel-mogrified gene symbol.

simplecheck <- sapply(all.sigs, function(x){
        tmp <- checkGeneSymbols(x, map = CurrentHumanMap)
        output <- c(sum(tmp$Approved)/nrow(tmp), 
                    sum(!is.na(tmp[, 3]))/nrow(tmp), 
                    any(toupper(x) %in% toupper(mog[, 2])))
        names(output) <- c("valid.frac", "valid.after.hgnchelper.frac", "any.excel")
        return(output)
    })
simplecheck <- t(simplecheck)
simplecheck
##                         valid.frac valid.after.hgnchelper.frac any.excel
## msigdb.v7.0.symbols.gmt  0.9779968                   0.9880915         0
## OVERALL                  0.9779968                   0.9880915         0

# checkGeneSymbols
tmp = checkGeneSymbols(all.sigs[[1]], map = CurrentHumanMap)

# the number of invalid gene symbols before
sum(tmp$Approved != "TRUE")   
## [1] 837

# the number of invalid gene symbols after
sum(is.na(tmp$Suggested.Symbol))   
## [1] 453

# Example of gene symbols that can't be corrected by HGNGhelper
unfixables <- sort(tmp[is.na(tmp[, 3]), 1])  
set.seed(1)
sample(unfixables, 10)
##  [1] "AL445686.2" "AC093583.1" "AC069404.1" "AP003781.1" "AL162431.1"
##  [6] "AL035530.2" "AC100756.3" "AL355338.1" "AC018683.1" "AL109810.2"

2 Gene symbols in GPL

2.1 Fetch GPL files and update using HGNChelper

GEO platform information is downloaded from https://www.ncbi.nlm.nih.gov/geo/browse/?view=platforms in five .csv files and combined/saved as platforms.csv. As of March 27th, 2020, there are 20,716 platforms exist in GEO.

Fetch all the gene symbols found in GPL, store in column 1 of platforms.csv.

library(GEOquery)
all.platforms <- read.csv("/data2/HGNChelper/inst/extdata/GPL_list/platform.csv", as.is=TRUE)

# human Entrez Gene identifier symbols
library(org.Hs.eg.db)
hgnc.reference <- as.character(org.Hs.egSYMBOL)   
names(hgnc.reference) <- NULL

2.2 Create gpls_already_tested.csv file

library(HGNChelper)

raw.platform.dir <- "platforms"
dir.create(raw.platform.dir)
hgnc.vec.dir <- "hgnc.vecs"
dir.create(hgnc.vec.dir)

if (file.exists("gpls_already_tested.csv")) {
    gpls.already.tested <- read.csv("gpls_already_tested.csv", header=TRUE)
} else {
    write.table(t(c("platform", "colname", "frac.hgnc", "nrow", 
                    "valid.frac", "valid.after.hgnchelper.frac", 
                    "distribution", "submission_date")),
                file="gpls_already_tested.csv", 
                row.names=FALSE, col.names=FALSE, sep=",")
}

for (i in 1:nrow(all.platforms)){
    gpl <- all.platforms[i, 1]
    hgnc.vec.file <- paste(hgnc.vec.dir, "/", gpl, "_hgnc.vec.RData", sep="")
    
    if(exists("gpls.already.tested") && gpl %in% gpls.already.tested$platform) next
    gpldat <- try(getGEO(gpl, destdir=raw.platform.dir))   
    if(class(gpldat) == "try-error") next
    gpltable <- try(Table(gpldat))
    if(class(gpltable) == "try-error") next
    hgnc.frac <- apply(gpltable, 2, function(x) sum(unique(x) %in% hgnc.reference) / length(unique(x)))
    
    # any column containing gene symbol will have >0 value
    if(any(hgnc.frac > 0.5)) {   # updated from `hgnc.frac > 0`
      
        # assumed that there is only one 'symbol' column
        hgnc.vec <- unique(as.character(gpltable[, which.max(hgnc.frac)]))  
        # get rid of anything after a space
        hgnc.vec <- gsub("[ ].+", "", hgnc.vec)  
        # convert to ascii
        HGNChelper.output <- checkGeneSymbols(iconv(hgnc.vec, "latin1", "ASCII", ""),
                                              map = CurrentHumanMap)  
        
        # fraction of valid gene symbols BEFORE HGNChelper
        valid.frac <- sum(HGNChelper.output$Approved) / length(hgnc.vec)  
        # fraction of valid gene symbols AFTER HGNChelper
        after.HGNChelper.valid.frac <- sum(!is.na(HGNChelper.output$Suggested.Symbol)) / length(hgnc.vec)  
        
        hgnc.vec <- c(gpldat@header$distribution, gpldat@header$submission_date, hgnc.vec)
        save(hgnc.vec, file=hgnc.vec.file, compress="bzip2")
        
        info.for.file <- t(c(gpl, colnames(gpltable)[which.max(hgnc.frac)], 
                             max(hgnc.frac), nrow(gpltable), 
                             valid.frac, after.HGNChelper.valid.frac, 
                             gpldat@header$distribution, 
                             gpldat@header$submission_date))
        
        print(paste(i, gpl, length(hgnc.vec), "HGNC symbols found and saved."))
        
    } else {
       info.for.file <- t(c(gpl, colnames(gpltable)[which.max(hgnc.frac)], 
                            max(hgnc.frac), nrow(gpltable), NA, NA, 
                            gpldat@header$distribution, 
                            gpldat@header$submission_date))
       print(paste(i, gpl, ": No HGNC symbols."))
    }
    
    write.table(info.for.file, file="gpls_already_tested.csv", 
                append=TRUE, row.names=FALSE, col.names=FALSE, sep=",")
}

2.3 Summary plots

2.3.1 Fraction of corrected gene symbols (by original prop. of valid symbols)

wd = "/data2/HGNChelper/inst/analyses"

x <- read.csv(file.path(wd, "gpls_already_tested.csv"), as.is=TRUE)
x$valid.after.hgnchelper.frac <- as.numeric(x$valid.after.hgnchelper.frac)
## Warning: NAs introduced by coercion
x <- x[!is.na(x$valid.frac), ]
x <- x[x$valid.frac > 0, ]

df <- data.frame(before=cut(100*x$valid.frac, breaks=seq(0, 100, by=20)),
                 fixed=(x$valid.after.hgnchelper.frac - x$valid.frac)/(1-x$valid.frac))

par(mar=c(4,5,2,0.5))
boxplot(fixed ~ before, data=df,
        main=paste("Correcting the annotations of", 
                   nrow(df), "GEO platforms"),
        xlab="% Valid Before HGNChelper",
        ylab="Fraction of invalid \n gene symbols fixed",
        col="grey", boxwex=1.1, varwidth=TRUE)

2.3.2 Fraction of valid gene symbols (by submission dates)

wd = "/data2/HGNChelper/inst/analyses"

raw <- read.csv(file.path(wd, "gpls_already_tested.csv"), as.is=TRUE)  # 20,713 raws
raw$valid.after.hgnchelper.frac <- as.numeric(raw$valid.after.hgnchelper.frac)
## Warning: NAs introduced by coercion
raw <- raw[!is.na(raw$valid.frac), ]   # 2,044 rows
raw <- raw[raw$valid.frac > 0.2, ]   # 2,043 rows

raw$year <- as.numeric(gsub(".+[ ]+", "", raw$submission_date))
n.years <- length(unique(raw$year))

boxplot(valid.frac ~ year, data=raw, boxwex=0.3, ylim = c(0.4, 1),
        main="Fig 2. Valid gene symbols for submission year",
        xlab="Submission Year", ylab="Fraction of valid gene symbols", 
        names=sub("20", "'", sort(unique(raw$year))))
boxplot(valid.after.hgnchelper.frac ~ year, data=raw, ylim = c(0.4, 1),
        boxwex=0.3, xaxt='n', at=(1:n.years)+0.35, add=TRUE, col="grey")
legend("bottomleft", legend=c("Before", "After"), pch=c(0, 15), 
       col=c("black", "grey"), lty=-1, bty='n', cex=1)

The number of unique platforms (= each dot in the above plot) each year

table(raw$year)
## 
## 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 
##   34   40  101   88  115  114  141  240  152  150  131  156  128  147  108   77 
## 2018 2019 2020 
##   56   56    9

3 Mouse Gene Symbol

Mouse gene symbols begin with an uppercase letter followed by all lowercase letters except for recessive mutations, which begin with a lowercaase letter.

source('/data2/HGNChelper/R/getCurrentMaps.R')
source('/data2/HGNChelper/R/checkGeneSymbols.R')

currentMouseMap = getCurrentMouseMap()
mouse = c("mug2", "Mug2", "Pzp", "A2m", "SEPT7", "1-Feb")
checkGeneSymbols(mouse, species = "mouse", map = currentMouseMap)
## Warning in checkGeneSymbols(mouse, species = "mouse", map = currentMouseMap): x
## contains non-approved gene symbols
##       x Approved Suggested.Symbol
## 1  mug2    FALSE  Cpamd8 /// Mug2
## 2  Mug2     TRUE             Mug2
## 3   Pzp     TRUE              Pzp
## 4   A2m     TRUE              A2m
## 5 SEPT7    FALSE            Cdc10
## 6 1-Feb    FALSE             Feb1