library(bigMap)
# load aux. stuff 
source('../primes.R')

Load data

load('../P1M.RData')

Run UMAP

Use the script below to run UMAP on a HPC platform with 20 cores and 8GB/core,

# ./UMAP/P1M_start.py: 
# Run UMAP on Primes1M

import numpy as np
import scipy.sparse
import sympy
import json
import time

import umap

# +++ data
max_ints = 1000001
primes = list(sympy.primerange(2, max_ints))
prime_to_column = {p:i for i, p in enumerate(primes)}

X_rows = []
X_data = []
for n in range(2, max_ints):
    f = sympy.factorint(n)
    X_rows.append([prime_to_column[p] for p in f.keys()])
    X_data.append([v for v in f.values()])

X = scipy.sparse.X_matrix((len(X_rows), len(primes)), dtype=np.float32)
X.rows = np.array(X_rows)
X.data = np.array(X_data)

output = {}
# +++ nearest neighbors
for nn in [50, 100, 200, 300, 400, 500]:

  strt = time.time()
  Y = umap.UMAP(n_neighbors = nn, min_dist = .1, metric = 'euclidean', init = 'random', low_memory = False).fit_transform(X)
  t = time.time() -strt

  output[str(nn)] = {'Y': Y.tolist(), 'nn': nn, 't': t}
  # +++ save up to here
  with open('./UMAP/P1M_umap.json', 'w') as g:
          json.dump(output, g)

Submit:

$ qsub -pe make 20 -l h_vmem=8G pySckt ~/UMAP/P1M_start.py

Convert from json to RData

json.list <- rjson::fromJSON(file = './UMAP/P1M_umap.json')
m.list <- lapply(json.list, function(j){
  m <- list(dSet = 'P1M', is.distance = F, is.sparse = T, normalize = T)
  m$ppx <- list(ppx = j$nn)
  m$ptsne <- list(threads = 20, GB = 160, Y = matrix(unlist(j$Y), ncol = 2, byrow = T))
  m$t <- list(umap = j$t)
  m
})
save(m.list, file = './P1M_umap.RData')
# load m.list 
load('./P1M_umap.RData')

Output

nulL <- lapply(m.list, function(m) primes.plot(P1M, m))

hl-Correlation

hlTable <- sapply(m.list, function(m) summary(m$hlC)[4])
hlTable <- matrix(round(hlTable, 4), nrow = 1)
colnames(hlTable) <- sapply(m.list, function(m) m$ppx$ppx)
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
  kable_styling(full_width = F)
hl-Correlation
50 100 200 300 400 500
<hlC> -0.0887 -0.0471 0.0104 0.1226 0.1021 0.2076

k-ary neighborhood preservation

bdm.knp.plot(m.list, ppxfrmt = 0)

Running Times

rTimes <- sapply(m.list, function(m) m$t$umap)
rTimes <- matrix(round(rTimes /60, 2), nrow = 1)
colnames(rTimes) <- sapply(m.list, function(m) m$ppx$ppx)
rownames(rTimes) <- c('UMAP')
knitr::kable(rTimes, caption = 'Computation times (min)') %>%
  kable_styling(full_width = F)
Computation times (min)
50 100 200 300 400 500
UMAP 128.92 329.45 796.83 1063.07 1107.37 1160.35

Run on: Intel(R) Xeon(R) CPU E5-2650 v3 2.30GHz, 32Mb cache, 20 cores, 8GB/core RAM.