library(bigMap)
# load aux. stuff
source('../primes.R')
load('../P1M.RData')
Use the script below to run UMAP on a HPC platform with 20 cores and 8GB/core,
# ./UMAP/P1M_start.py:
# Run UMAP on Primes1M
import numpy as np
import scipy.sparse
import sympy
import json
import time
import umap
# +++ data
max_ints = 1000001
primes = list(sympy.primerange(2, max_ints))
prime_to_column = {p:i for i, p in enumerate(primes)}
X_rows = []
X_data = []
for n in range(2, max_ints):
f = sympy.factorint(n)
X_rows.append([prime_to_column[p] for p in f.keys()])
X_data.append([v for v in f.values()])
X = scipy.sparse.X_matrix((len(X_rows), len(primes)), dtype=np.float32)
X.rows = np.array(X_rows)
X.data = np.array(X_data)
output = {}
# +++ nearest neighbors
for nn in [50, 100, 200, 300, 400, 500]:
strt = time.time()
Y = umap.UMAP(n_neighbors = nn, min_dist = .1, metric = 'euclidean', init = 'random', low_memory = False).fit_transform(X)
t = time.time() -strt
output[str(nn)] = {'Y': Y.tolist(), 'nn': nn, 't': t}
# +++ save up to here
with open('./UMAP/P1M_umap.json', 'w') as g:
json.dump(output, g)
Submit:
$ qsub -pe make 20 -l h_vmem=8G pySckt ~/UMAP/P1M_start.py
json.list <- rjson::fromJSON(file = './UMAP/P1M_umap.json')
m.list <- lapply(json.list, function(j){
m <- list(dSet = 'P1M', is.distance = F, is.sparse = T, normalize = T)
m$ppx <- list(ppx = j$nn)
m$ptsne <- list(threads = 20, GB = 160, Y = matrix(unlist(j$Y), ncol = 2, byrow = T))
m$t <- list(umap = j$t)
m
})
save(m.list, file = './P1M_umap.RData')
# load m.list
load('./P1M_umap.RData')
nulL <- lapply(m.list, function(m) primes.plot(P1M, m))
hlTable <- sapply(m.list, function(m) summary(m$hlC)[4])
hlTable <- matrix(round(hlTable, 4), nrow = 1)
colnames(hlTable) <- sapply(m.list, function(m) m$ppx$ppx)
rownames(hlTable) <- c('<hlC>')
knitr::kable(hlTable, caption = 'hl-Correlation') %>%
kable_styling(full_width = F)
| 50 | 100 | 200 | 300 | 400 | 500 | |
|---|---|---|---|---|---|---|
| <hlC> | -0.0887 | -0.0471 | 0.0104 | 0.1226 | 0.1021 | 0.2076 |
bdm.knp.plot(m.list, ppxfrmt = 0)
rTimes <- sapply(m.list, function(m) m$t$umap)
rTimes <- matrix(round(rTimes /60, 2), nrow = 1)
colnames(rTimes) <- sapply(m.list, function(m) m$ppx$ppx)
rownames(rTimes) <- c('UMAP')
knitr::kable(rTimes, caption = 'Computation times (min)') %>%
kable_styling(full_width = F)
| 50 | 100 | 200 | 300 | 400 | 500 | |
|---|---|---|---|---|---|---|
| UMAP | 128.92 | 329.45 | 796.83 | 1063.07 | 1107.37 | 1160.35 |
Run on: Intel(R) Xeon(R) CPU E5-2650 v3 2.30GHz, 32Mb cache, 20 cores, 8GB/core RAM.